From 4c23690f43e51eccf6ce5866ac47adcf39215e4d Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 18 Nov 2025 23:06:21 -0500 Subject: [PATCH 001/249] [Attention] FlashAttention ViT support, make default backend (#28763) Signed-off-by: Matthew Bonanni --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- tests/kernels/attention/test_flash_attn.py | 4 +-- tests/kernels/attention/test_mha_attn.py | 30 +------------------ vllm/platforms/cuda.py | 21 ++++++------- vllm/v1/attention/backends/flash_attn.py | 4 +-- 5 files changed, 15 insertions(+), 46 deletions(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 567c8959f0454..6cc5cda14c525 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 58e0626a692f09241182582659e3bf8f16472659 + GIT_TAG 71bb26f6295449be880344b93b51791cc009237d GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/tests/kernels/attention/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py index 6e5468969bf25..26b8c77ab482f 100644 --- a/tests/kernels/attention/test_flash_attn.py +++ b/tests/kernels/attention/test_flash_attn.py @@ -13,14 +13,14 @@ from vllm.vllm_flash_attn import ( ) NUM_HEADS = [(4, 4), (8, 2)] -HEAD_SIZES = [128, 256] +HEAD_SIZES = [40, 72, 80, 128, 256] BLOCK_SIZES = [16] DTYPES = [torch.bfloat16] QDTYPES = [None, torch.float8_e4m3fn] # one value large enough to test overflow in index calculation. 
# one value small enough to test the schema op check NUM_BLOCKS = [32768, 2048] -SOFT_CAPS = [None, 50.0] +SOFT_CAPS = [None] SLIDING_WINDOWS = [None, 256] diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py index 183bbf3bf4e03..a878ac6396ce5 100644 --- a/tests/kernels/attention/test_mha_attn.py +++ b/tests/kernels/attention/test_mha_attn.py @@ -62,38 +62,10 @@ def test_mha_attn_platform(device: str): assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN # Test CUDA with head_size=72 (not divisible by 32) - # - with upstream FA not available - # - should use xformers + # - should use vLLM's FlashAttention with ( patch("vllm.attention.layer.current_platform", CudaPlatform()), patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()), - patch( - "vllm.attention.layer.check_upstream_fa_availability", - return_value=False, - ), - ): - attn = MultiHeadAttention(16, 72, scale=1) - assert attn.attn_backend == AttentionBackendEnum.XFORMERS - - # Test CUDA with head_size=72 (not divisible by 32) - # - with upstream FA available - # - should use upstream FA - with ( - patch("vllm.attention.layer.current_platform", CudaPlatform()), - patch("vllm.model_executor.models.vision.current_platform", CudaPlatform()), - patch( - "vllm.attention.layer.check_upstream_fa_availability", return_value=True - ), - patch.dict( - "sys.modules", - { - "flash_attn": type( - "MockFlashAttn", - (), - {"flash_attn_varlen_func": lambda *args, **kwargs: None}, - )() - }, - ), ): attn = MultiHeadAttention(16, 72, scale=1) assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 2e4dd8bb808b4..f9bf242b7194e 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -267,24 +267,21 @@ class CudaPlatformBase(Platform): ) -> "AttentionBackendEnum": from vllm.attention.backends.registry import AttentionBackendEnum - # For Blackwell GPUs, force TORCH_SDPA for 
now. - # See https://github.com/facebookresearch/xformers/issues/1317#issuecomment-3199392579 # noqa: E501 - if cls.has_device_capability(100): - return AttentionBackendEnum.TORCH_SDPA - - if dtype not in (torch.float16, torch.bfloat16): - return AttentionBackendEnum.XFORMERS - - if cls.has_device_capability(80): + # Try FlashAttention first + try: backend_class = AttentionBackendEnum.FLASH_ATTN.get_class() if backend_class.supports_head_size( head_size ) and backend_class.supports_dtype(dtype): return AttentionBackendEnum.FLASH_ATTN - else: - return AttentionBackendEnum.XFORMERS + except ImportError: + pass + + if cls.has_device_capability(100): + # xFormers doesn't support Blackwell, fall back to SDPA + # See https://github.com/facebookresearch/xformers/issues/1317#issuecomment-3199392579 # noqa: E501 + return AttentionBackendEnum.TORCH_SDPA else: - # Fallback for Volta/Turing GPUs or FA not supported return AttentionBackendEnum.XFORMERS @classmethod diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index a5d4435000d4d..fdc99a0df1c8a 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -119,8 +119,8 @@ class FlashAttentionBackend(AttentionBackend): raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}") @classmethod - def get_supported_head_sizes(cls) -> list[int]: - return [32, 64, 96, 128, 160, 192, 224, 256] + def supports_head_size(cls, head_size: int) -> bool: + return head_size % 8 == 0 and head_size <= 256 @classmethod def supports_kv_cache_dtype(cls, kv_cache_dtype: CacheDType | None) -> bool: From 468a8d72bac181c1499320478940cec64363e107 Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Tue, 18 Nov 2025 21:05:22 -0800 Subject: [PATCH 002/249] [Bugfix] Fix FusedMoEModularKernel for triton backend (#28913) Signed-off-by: Xin Yang --- vllm/model_executor/layers/quantization/mxfp4.py | 10 ++++++---- 1 file changed, 
6 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index b95d1a6b3a1f5..66ae2e94c60a5 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -755,8 +755,10 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): self.w13_weight = w13_weight self.w2_weight = w2_weight - layer.w13_weight = Parameter(w13_weight.storage.data, requires_grad=False) - layer.w2_weight = Parameter(w2_weight.storage.data, requires_grad=False) + del layer.w13_weight + del layer.w2_weight + layer.w13_weight = w13_weight + layer.w2_weight = w2_weight else: raise ValueError(f"Unsupported backend: {self.mxfp4_backend}") @@ -1065,8 +1067,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): return triton_kernel_moe_forward( hidden_states=x, - w1=self.w13_weight, - w2=self.w2_weight, + w1=layer.w13_weight, + w2=layer.w2_weight, gating_output=router_logits, topk=top_k, renormalize=renormalize, From 73ff872db0d4e3f5e133d5d2a5307248619d93a6 Mon Sep 17 00:00:00 2001 From: Gleb Kurchanov Date: Wed, 19 Nov 2025 08:21:02 +0300 Subject: [PATCH 003/249] [Bugfix] Fix typo in Qwen3 Next model executor (#28960) Signed-off-by: Gleb Kurchanov --- vllm/model_executor/models/qwen3_next.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 86508a7c64317..0415c8e00fdfa 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -1154,8 +1154,8 @@ class QwenNextMixtureOfExperts(MixtureOfExperts): example_moe = layer.mlp self.moe_layers.append(layer.mlp.experts) - if example_moe is None: - raise RuntimeError("No Qwen3Next layer found in the model.layers.") + if example_moe is None: + raise RuntimeError("No Qwen3Next layer found in the model.layers.") # Set MoE hyperparameters self.num_moe_layers = len(self.moe_layers) From 
6a25ea5f0ea193e35b5a83cb0285c48964bc9eb1 Mon Sep 17 00:00:00 2001 From: Uranus <109661872+UranusSeven@users.noreply.github.com> Date: Wed, 19 Nov 2025 13:30:08 +0800 Subject: [PATCH 004/249] [Docs] Update oneshot imports (#28188) Signed-off-by: UranusSeven <109661872+UranusSeven@users.noreply.github.com> --- docs/features/quantization/fp8.md | 2 +- docs/features/quantization/int4.md | 2 +- docs/features/quantization/int8.md | 2 +- docs/features/quantization/quantized_kvcache.md | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md index 0c5111fb8af0d..d4a6176b236f1 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -60,7 +60,7 @@ Since simple RTN does not require data for weight quantization and the activatio ??? code ```python - from llmcompressor.transformers import oneshot + from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier # Configure the simple PTQ quantization diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md index 035e7ea291f9e..9752039097d63 100644 --- a/docs/features/quantization/int4.md +++ b/docs/features/quantization/int4.md @@ -80,7 +80,7 @@ Now, apply the quantization algorithms: ??? code ```python - from llmcompressor.transformers import oneshot + from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.modifiers.smoothquant import SmoothQuantModifier diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md index ec8a77f74ffef..701ca6378cb16 100644 --- a/docs/features/quantization/int8.md +++ b/docs/features/quantization/int8.md @@ -87,7 +87,7 @@ Now, apply the quantization algorithms: ??? 
code ```python - from llmcompressor.transformers import oneshot + from llmcompressor import oneshot from llmcompressor.modifiers.quantization import GPTQModifier from llmcompressor.modifiers.smoothquant import SmoothQuantModifier diff --git a/docs/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md index 56cf057678be6..d26a5e217f314 100644 --- a/docs/features/quantization/quantized_kvcache.md +++ b/docs/features/quantization/quantized_kvcache.md @@ -78,7 +78,7 @@ Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models ```python from datasets import load_dataset from transformers import AutoModelForCausalLM, AutoTokenizer - from llmcompressor.transformers import oneshot + from llmcompressor import oneshot # Select model and load it MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" From 3d4e7d34be856cc4f54033e6a019059afacb5e76 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Wed, 19 Nov 2025 05:43:01 +0000 Subject: [PATCH 005/249] [Model][QwenVL] Simplify cos/sin rotary embedding indexing (#28962) Signed-off-by: Lukas Geiger --- vllm/model_executor/models/glm4_1v.py | 9 ++------- vllm/model_executor/models/qwen2_5_vl.py | 9 ++------- vllm/model_executor/models/qwen2_vl.py | 9 ++------- .../models/qwen3_omni_moe_thinker.py | 9 ++------- vllm/model_executor/models/qwen3_vl.py | 17 +++-------------- 5 files changed, 11 insertions(+), 42 deletions(-) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 2c2f45c2453ee..7a4fee76ae6b3 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -797,13 +797,8 @@ class Glm4vVisionTransformer(nn.Module): # Use pre-computed cos_sin_cache from RotaryEmbedding cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size) - cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) - cos_w = cos[pos_ids[:, 1]] - sin_h = sin[pos_ids[:, 0]] - sin_w = sin[pos_ids[:, 1]] - - cos_combined = 
torch.cat([cos_h, cos_w], dim=-1) - sin_combined = torch.cat([sin_h, sin_w], dim=-1) + cos_combined = cos[pos_ids].flatten(1) + sin_combined = sin[pos_ids].flatten(1) return cos_combined, sin_combined, pos_ids def compute_attn_mask_seqlen( diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 2e4fd9645d88f..5b5d50ec8935a 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -738,13 +738,8 @@ class Qwen2_5_VisionTransformer(nn.Module): # Use pre-computed cos_sin_cache from RotaryEmbedding cos, sin = self.rotary_pos_emb.get_cos_sin(max_size) - cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) - cos_w = cos[pos_ids[:, 1]] - sin_h = sin[pos_ids[:, 0]] - sin_w = sin[pos_ids[:, 1]] - - cos_combined = torch.cat([cos_h, cos_w], dim=-1) - sin_combined = torch.cat([sin_h, sin_w], dim=-1) + cos_combined = cos[pos_ids].flatten(1) + sin_combined = sin[pos_ids].flatten(1) cos_combined = cos_combined.reshape( cos_combined.shape[0] // self.spatial_merge_unit, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 53df5972a8fe1..cda8eaf5377f1 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -724,13 +724,8 @@ class Qwen2VisionTransformer(nn.Module): # Use pre-computed cos_sin_cache from RotaryEmbedding cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size) - cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) - cos_w = cos[pos_ids[:, 1]] - sin_h = sin[pos_ids[:, 0]] - sin_w = sin[pos_ids[:, 1]] - - cos_combined = torch.cat([cos_h, cos_w], dim=-1) - sin_combined = torch.cat([sin_h, sin_w], dim=-1) + cos_combined = cos[pos_ids].flatten(1) + sin_combined = sin[pos_ids].flatten(1) return cos_combined, sin_combined def compute_attn_mask_seqlen( diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 
8274b92138f78..d2fd74a5e41ad 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -428,13 +428,8 @@ class Qwen3Omni_VisionTransformer(nn.Module): # Use pre-computed cos_sin_cache from RotaryEmbedding cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size) - cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) - cos_w = cos[pos_ids[:, 1]] - sin_h = sin[pos_ids[:, 0]] - sin_w = sin[pos_ids[:, 1]] - - cos_combined = torch.cat([cos_h, cos_w], dim=-1) - sin_combined = torch.cat([sin_h, sin_w], dim=-1) + cos_combined = cos[pos_ids].flatten(1) + sin_combined = sin[pos_ids].flatten(1) return cos_combined, sin_combined diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 99a4007ef7f23..0c546309400b7 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -459,18 +459,13 @@ class Qwen3_VisionTransformer(nn.Module): else self.rot_pos_ids(h, w, self.spatial_merge_size).repeat(t, 1) for t, h, w in grid_thw ] - pos_ids = torch.cat(pos_ids, dim=0) + pos_ids = torch.cat(pos_ids, dim=0).to(self.device, non_blocking=True) # Use pre-computed cos_sin_cache from RotaryEmbedding cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size) - cos_h = cos[pos_ids[:, 0]] # (num_tokens, rotary_dim // 2) - cos_w = cos[pos_ids[:, 1]] - sin_h = sin[pos_ids[:, 0]] - sin_w = sin[pos_ids[:, 1]] - - cos_combined = torch.cat([cos_h, cos_w], dim=-1) - sin_combined = torch.cat([sin_h, sin_w], dim=-1) + cos_combined = cos[pos_ids].flatten(1) + sin_combined = sin[pos_ids].flatten(1) return cos_combined, sin_combined @@ -566,12 +561,6 @@ class Qwen3_VisionTransformer(nn.Module): pos_embeds = self.fast_pos_embed_interpolate(grid_thw_list) hidden_states = hidden_states + pos_embeds rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list) - rotary_pos_emb_cos = rotary_pos_emb_cos.to( - hidden_states.device, non_blocking=True 
- ) - rotary_pos_emb_sin = rotary_pos_emb_sin.to( - hidden_states.device, non_blocking=True - ) cu_seqlens = torch.repeat_interleave( grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] From 71d0ae1c54543689ea7541aa20b9522982b0815e Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Wed, 19 Nov 2025 09:28:40 +0300 Subject: [PATCH 006/249] [Misc] Update embedding/cross encoder tests to use `mteb` v2 (#27329) Signed-off-by: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Signed-off-by: wang.yuqi Signed-off-by: wang.yuqi Co-authored-by: Cyrus Leung Co-authored-by: Isotr0py Co-authored-by: wang.yuqi Co-authored-by: wang.yuqi --- requirements/test.in | 2 +- requirements/test.txt | 4 +- .../language/pooling_mteb_test/mteb_utils.py | 181 +++++++++++------- .../test_bge_reranker_v2_gemma.py | 31 ++- .../pooling_mteb_test/test_mxbai_rerank.py | 5 +- .../pooling_mteb_test/test_qwen3_reranker.py | 5 +- 6 files changed, 144 insertions(+), 84 deletions(-) diff --git a/requirements/test.in b/requirements/test.in index 30d97e9b9c7d0..05f6bcca5c2c4 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -36,7 +36,7 @@ opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test # TODO: Use lm-eval[api]==0.4.10 once released lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test -mteb[bm25s]>=1.38.11, <2 # required for mteb test +mteb[bm25s]>=2, <3 # required for mteb test transformers==4.57.1 tokenizers==0.22.0 schemathesis>=3.39.15 # Required for openai schema test. 
diff --git a/requirements/test.txt b/requirements/test.txt index 3263b74c08797..bcd511660f85e 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -201,8 +201,6 @@ email-validator==2.2.0 # via pydantic encodec==0.1.1 # via vocos -eval-type-backport==0.2.2 - # via mteb evaluate==0.4.3 # via lm-eval fastapi==0.116.1 @@ -490,7 +488,7 @@ msgpack==1.1.0 # via # librosa # ray -mteb==1.38.11 +mteb==2.1.2 # via -r requirements/test.in multidict==6.1.0 # via diff --git a/tests/models/language/pooling_mteb_test/mteb_utils.py b/tests/models/language/pooling_mteb_test/mteb_utils.py index 0384ff82790f0..189cdbae99dcd 100644 --- a/tests/models/language/pooling_mteb_test/mteb_utils.py +++ b/tests/models/language/pooling_mteb_test/mteb_utils.py @@ -2,12 +2,14 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import tempfile -from collections.abc import Sequence import mteb import numpy as np import requests import torch +from mteb.models import ModelMeta +from mteb.types import Array +from torch.utils.data import DataLoader import tests.ci_envs as ci_envs from tests.models.utils import ( @@ -27,24 +29,47 @@ MTEB_EMBED_TOL = 1e-4 # See #19344 MTEB_RERANK_TASKS = ["NFCorpus"] -MTEB_RERANK_LANGS = ["en"] +MTEB_RERANK_LANGS = ["eng"] MTEB_RERANK_TOL = 2e-3 +_empty_model_meta = ModelMeta( + loader=None, + name="vllm/model", + revision="1", + release_date=None, + languages=None, + framework=[], + similarity_fn_name=None, + n_parameters=None, + memory_usage_mb=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=None, + public_training_code=None, + public_training_data=None, + use_instructions=None, + training_datasets=None, + modalities=["text"], # 'image' can be added to evaluate multimodal models +) + + +class VllmMtebEncoder(mteb.EncoderProtocol): + mteb_model_meta = _empty_model_meta -class VllmMtebEncoder(mteb.Encoder): def __init__(self, vllm_model): - super().__init__() self.llm = vllm_model self.rng = 
np.random.default_rng(seed=42) def encode( self, - sentences: Sequence[str], + inputs: DataLoader[mteb.types.BatchedInput], *args, **kwargs, ) -> np.ndarray: # Hoping to discover potential scheduling # issues by randomizing the order. + sentences = [text for batch in inputs for text in batch["text"]] r = self.rng.permutation(len(sentences)) sentences = [sentences[i] for i in r] outputs = self.llm.embed(sentences, use_tqdm=False) @@ -52,36 +77,70 @@ class VllmMtebEncoder(mteb.Encoder): embeds = embeds[np.argsort(r)] return embeds + def similarity( + self, + embeddings1: np.ndarray, + embeddings2: np.ndarray, + ) -> np.ndarray: + # Cosine similarity + norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True) + norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True) + sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T) + return sim + + def similarity_pairwise( + self, + embeddings1: Array, + embeddings2: Array, + ) -> Array: + # Cosine similarity + norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True) + norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True) + sim = np.sum(embeddings1 * embeddings2, axis=1) / ( + norm1.flatten() * norm2.flatten() + ) + return sim + + +class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol): + mteb_model_meta = _empty_model_meta + + def __init__(self, vllm_model): + self.llm = vllm_model + self.rng = np.random.default_rng(seed=42) + def predict( self, - sentences: list[tuple[str, str, str | None]], # query, corpus, prompt + inputs1: DataLoader[mteb.types.BatchedInput], + inputs2: DataLoader[mteb.types.BatchedInput], *args, **kwargs, ) -> np.ndarray: - r = self.rng.permutation(len(sentences)) - sentences = [sentences[i] for i in r] - - queries = [s[0] for s in sentences] - corpus = [s[1] for s in sentences] + queries = [text for batch in inputs1 for text in batch["text"]] + corpus = [text for batch in inputs2 for text in batch["text"]] outputs = self.llm.score( queries, corpus, truncate_prompt_tokens=-1, 
use_tqdm=False ) scores = np.array(outputs) - scores = scores[np.argsort(r)] return scores -class OpenAIClientMtebEncoder(mteb.Encoder): +class OpenAIClientMtebEncoder(VllmMtebEncoder): def __init__(self, model_name: str, client): - super().__init__() self.model_name = model_name self.client = client self.rng = np.random.default_rng(seed=42) - def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray: + def encode( + self, + inputs: DataLoader[mteb.types.BatchedInput], + *args, + **kwargs, + ) -> np.ndarray: # Hoping to discover potential scheduling # issues by randomizing the order. + sentences = [text for batch in inputs for text in batch["text"]] r = self.rng.permutation(len(sentences)) sentences = [sentences[i] for i in r] @@ -94,28 +153,29 @@ class OpenAIClientMtebEncoder(mteb.Encoder): return embeds -class ScoreClientMtebEncoder(mteb.Encoder): +class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol): + mteb_model_meta = _empty_model_meta + def __init__(self, model_name: str, url): - super().__init__() self.model_name = model_name self.url = url self.rng = np.random.default_rng(seed=42) def predict( self, - sentences: list[tuple[str, str, str | None]], # query, corpus, prompt + inputs1: DataLoader[mteb.types.BatchedInput], + inputs2: DataLoader[mteb.types.BatchedInput], *args, **kwargs, ) -> np.ndarray: - r = self.rng.permutation(len(sentences)) - sentences = [sentences[i] for i in r] + queries = [text for batch in inputs1 for text in batch["text"]] + full_corpus = [text for batch in inputs2 for text in batch["text"]] outputs = [] - for query, corpus, prompt in sentences: + for query, corpus in zip(queries, full_corpus): outputs.append(self.get_score(query, corpus)) scores = np.array(outputs) - scores = scores[np.argsort(r)] return scores def get_score(self, query, corpus): @@ -145,16 +205,13 @@ class RerankClientMtebEncoder(ScoreClientMtebEncoder): return response["results"][0]["relevance_score"] -def run_mteb_embed_task(encoder, tasks): +def 
run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks): tasks = mteb.get_tasks(tasks=tasks) - evaluation = mteb.MTEB(tasks=tasks) - results = evaluation.run( + results = mteb.evaluate( encoder, - verbosity=0, - output_folder=None, - encode_kwargs={ - "show_progress_bar": False, - }, + tasks, + cache=None, + show_progress_bar=False, ) main_score = results[0].scores["test"][0]["main_score"] @@ -244,33 +301,39 @@ def mteb_test_embed_models( assert st_main_score - vllm_main_score < atol -def run_mteb_rerank(cross_encoder, tasks, languages): - with tempfile.TemporaryDirectory() as results_folder: +def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages): + with tempfile.TemporaryDirectory() as prediction_folder: bm25s = mteb.get_model("bm25s") - tasks = mteb.get_tasks(tasks=tasks, languages=languages) - - subset = "default" eval_splits = ["test"] - evaluation = mteb.MTEB(tasks=tasks) - evaluation.run( - bm25s, - verbosity=0, - eval_splits=eval_splits, - save_predictions=True, - output_folder=f"{results_folder}/stage1", - encode_kwargs={"show_progress_bar": False}, + mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks( + tasks=tasks, languages=languages, eval_splits=eval_splits ) - results = evaluation.run( + mteb.evaluate( + bm25s, + mteb_tasks, + prediction_folder=prediction_folder, + show_progress_bar=False, + # don't save results for test runs + cache=None, + overwrite_strategy="always", + ) + + second_stage_tasks = [] + for task in mteb_tasks: + second_stage_tasks.append( + task.convert_to_reranking( + prediction_folder, + top_k=10, + ) + ) + + results = mteb.evaluate( cross_encoder, - verbosity=0, - eval_splits=eval_splits, - top_k=10, - save_predictions=True, - output_folder=f"{results_folder}/stage2", - previous_results=f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json", - encode_kwargs={"show_progress_bar": False}, + second_stage_tasks, + show_progress_bar=False, + cache=None, ) main_score = 
results[0].scores["test"][0]["main_score"] return main_score @@ -280,20 +343,6 @@ def mteb_test_rerank_models_hf( hf_runner, model_name, hf_dtype="float32", hf_model_callback=None ): with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model: - original_predict = hf_model.predict - - def _predict( - sentences: list[tuple[str, str, str | None]], # query, corpus, prompt - *args, - **kwargs, - ): - # vllm and st both remove the prompt, fair comparison. - prompts = [(s[0], s[1]) for s in sentences] - return original_predict(prompts, *args, **kwargs, batch_size=8) - - hf_model.predict = _predict - hf_model.original_predict = original_predict - if hf_model_callback is not None: hf_model_callback(hf_model) @@ -310,7 +359,7 @@ def mteb_test_rerank_models( model_info: RerankModelInfo, vllm_extra_kwargs=None, hf_model_callback=None, - vllm_mteb_encoder=VllmMtebEncoder, + vllm_mteb_encoder=VllmMtebCrossEncoder, atol=MTEB_RERANK_TOL, ): vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs) diff --git a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py index 2927a37111364..6b2e469644926 100644 --- a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py +++ b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py @@ -2,13 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any +import mteb import numpy as np import pytest import torch +from torch.utils.data import DataLoader from tests.conftest import HfRunner from tests.models.language.pooling_mteb_test.mteb_utils import ( - VllmMtebEncoder, + VllmMtebCrossEncoder, mteb_test_rerank_models, ) from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo @@ -103,7 +105,7 @@ class GemmaRerankerHfRunner(HfRunner): return torch.Tensor(scores) -class GemmaMtebEncoder(VllmMtebEncoder): +class 
GemmaMtebEncoder(VllmMtebCrossEncoder): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.query_template = "A: {query}\n" @@ -111,17 +113,26 @@ class GemmaMtebEncoder(VllmMtebEncoder): def predict( self, - sentences: list[tuple[str, str, str | None]], # query, corpus, prompt + inputs1: DataLoader[mteb.types.BatchedInput], + inputs2: DataLoader[mteb.types.BatchedInput], *args, **kwargs, ) -> np.ndarray: - _sentences = [] - for query, corpus, prompt in sentences: - query = self.query_template.format(query=query) - corpus = self.document_template.format(doc=corpus, prompt=PROMPT) - _sentences.append((query, corpus, prompt)) - - return super().predict(_sentences, *args, **kwargs) + queries = [ + self.query_template.format(query=text) + for batch in inputs1 + for text in batch["text"] + ] + corpus = [ + self.document_template.format(doc=text, prompt=PROMPT) + for batch in inputs2 + for text in batch["text"] + ] + outputs = self.llm.score( + queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False + ) + scores = np.array(outputs) + return scores @pytest.mark.parametrize("model_info", RERANK_MODELS) diff --git a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py index fd04dc1990238..a6f2a89b268f1 100644 --- a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py +++ b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py @@ -70,8 +70,9 @@ class MxbaiRerankerHfRunner(HfRunner): return scores scores = [] - for prompt in prompts: - inputs = process_inputs([prompt]) + for query, doc, *_ in prompts: + pairs = [(query, doc)] + inputs = process_inputs(pairs) score = compute_logits(inputs) scores.append(score[0].item()) return torch.Tensor(scores) diff --git a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py index 00e99f44cfdb1..9a1be6c0be1d6 100644 --- 
a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py +++ b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py @@ -72,8 +72,9 @@ class Qwen3RerankerHfRunner(HfRunner): return scores scores = [] - for prompt in prompts: - inputs = process_inputs([prompt]) + for query, doc, *_ in prompts: + pairs = [(query, doc)] + inputs = process_inputs(pairs) score = compute_logits(inputs) scores.append(score[0].item()) return torch.Tensor(scores) From a4511e38db375a85b4dd784c2c38528747288f46 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 19 Nov 2025 01:46:32 -0500 Subject: [PATCH 007/249] Speed up macOS smoke test (#28954) Signed-off-by: Michael Goin Signed-off-by: mgoin --- .github/workflows/macos-smoke-test.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml index 42b05ecd5ac06..a183033c9adde 100644 --- a/.github/workflows/macos-smoke-test.yml +++ b/.github/workflows/macos-smoke-test.yml @@ -9,7 +9,7 @@ on: jobs: macos-m1-smoke-test: runs-on: macos-latest - timeout-minutes: 20 + timeout-minutes: 30 steps: - uses: actions/checkout@v4 @@ -37,15 +37,14 @@ jobs: - name: Verify installation run: | python -c "import vllm; print(f'vLLM version: {vllm.__version__}')" - python -c "import torch; print(f'PyTorch: {torch.__version__}')" - name: Smoke test vllm serve - timeout-minutes: 10 run: | # Start server in background vllm serve Qwen/Qwen3-0.6B \ - --max-model-len=2048 \ + --max-model-len=2K \ --load-format=dummy \ + --hf-overrides '{"num_hidden_layers": 2}' \ --enforce-eager \ --port 8000 & From 7ed27f3cb55e3f64614300ec7acde1b382a48541 Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Wed, 19 Nov 2025 07:52:30 +0100 Subject: [PATCH 008/249] [Doc]: fix typos in various files (#28945) Signed-off-by: Didier Durand --- docs/design/moe_kernel_features.md | 4 ++-- docs/design/plugin_system.md | 2 +- 
docs/features/quantization/quark.md | 2 +- examples/online_serving/prometheus_grafana/README.md | 2 +- vllm/engine/arg_utils.py | 2 +- vllm/envs.py | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md index 7663b82266f0b..36ae9506b65fb 100644 --- a/docs/design/moe_kernel_features.md +++ b/docs/design/moe_kernel_features.md @@ -4,7 +4,7 @@ The purpose of this document is to provide an overview of the various MoE kernel ## Fused MoE Modular All2All backends -There are a number of all2all communication backends that are used to implement expert parallelism (EP) for the `FusedMoE` layer. The different `FusedMoEPrepareAndFinalize` sub-classes provide an interface for each all2all backend. +There are a number of all2all communication backends that are used to implement expert parallelism (EP) for the `FusedMoE` layer. The different `FusedMoEPrepareAndFinalize` subclasses provide an interface for each all2all backend. The following table describes the relevant features of each backend, i.e. activation format, supported quantization schemes and async support. @@ -68,7 +68,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes. ## Fused MoE Experts Kernels -The are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties. +There are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. 
Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties. Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx`, `DeepEPLLPrepareAndFinalize`. diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index dc2f7c4aed3c3..e8db8047ca4e6 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -49,7 +49,7 @@ Every plugin has three parts: - **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported. -- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre/post processing of the model prompt and model output for pooling models. The plugin function returns the IOProcessor's class fully qualified name. +- **IO Processor plugins** (with group name `vllm.io_processor_plugins`): The primary use case for these plugins is to register custom pre-/post-processing of the model prompt and model output for pooling models. The plugin function returns the IOProcessor's class fully qualified name. - **Stat logger plugins** (with group name `vllm.stat_logger_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree loggers into vLLM. The entry point should be a class that subclasses StatLoggerBase. 
diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md index bd7bc186e13aa..c54d7d2251999 100644 --- a/docs/features/quantization/quark.md +++ b/docs/features/quantization/quark.md @@ -306,7 +306,7 @@ As examples, we provide some ready-to-use quantized mixed precision model to sho ### 2. inference the quantized mixed precision model in vLLM -Models quantized with AMD Quark using mixed precision can natively be reload in vLLM, and e.g. evaluated using lm-evaluation-harness as follow: +Models quantized with AMD Quark using mixed precision can natively be reload in vLLM, and e.g. evaluated using lm-evaluation-harness as follows: ```bash lm_eval --model vllm \ diff --git a/examples/online_serving/prometheus_grafana/README.md b/examples/online_serving/prometheus_grafana/README.md index 5cd4dab5a8fa7..9615210a2ad80 100644 --- a/examples/online_serving/prometheus_grafana/README.md +++ b/examples/online_serving/prometheus_grafana/README.md @@ -46,7 +46,7 @@ Navigate to [`http://localhost:3000`](http://localhost:3000). Log in with the de Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus. -On Prometheus configuration page, we need to add the `Prometheus Server URL` in `Connection`. For this setup, Grafana and Prometheus are running in separate containers, but Docker creates DNS name for each containers. You can just use `http://prometheus:9090`. +On Prometheus configuration page, we need to add the `Prometheus Server URL` in `Connection`. For this setup, Grafana and Prometheus are running in separate containers, but Docker creates DNS name for each container. You can just use `http://prometheus:9090`. Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.". 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ab6e5e594c239..e2f7326448b3a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1500,7 +1500,7 @@ class EngineArgs: # Local DP rank = 1, use pure-external LB. if data_parallel_external_lb: assert self.data_parallel_rank is not None, ( - "data_parallel_rank or node_rank must be spefified if " + "data_parallel_rank or node_rank must be specified if " "data_parallel_external_lb is enable." ) assert self.data_parallel_size_local in (1, None), ( diff --git a/vllm/envs.py b/vllm/envs.py index 6d92d5afee501..e61fb114325c6 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1261,7 +1261,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # MoE routing strategy selector. # See `RoutingSimulator.get_available_strategies()` # for available # strategies. - # Cutstom routing strategies can be registered by + # Custom routing strategies can be registered by # RoutingSimulator.register_strategy() # Note: custom strategies may not produce correct model outputs "VLLM_MOE_ROUTING_SIMULATION_STRATEGY": lambda: os.environ.get( From ae4821a1086325decbc801d3292dee42e42549bb Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Tue, 18 Nov 2025 23:47:57 -0800 Subject: [PATCH 009/249] Add CPU support model (#28697) Signed-off-by: Tsai, Louie --- docs/models/hardware_supported_models/cpu.md | 26 ++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 docs/models/hardware_supported_models/cpu.md diff --git a/docs/models/hardware_supported_models/cpu.md b/docs/models/hardware_supported_models/cpu.md new file mode 100644 index 0000000000000..0832755f8fbe2 --- /dev/null +++ b/docs/models/hardware_supported_models/cpu.md @@ -0,0 +1,26 @@ +# CPU - Intel® Xeon® + +## Supported Models + +### Text-only Language Models + +| Model | Architecture | Supported | +|--------------------------------------|-------------------------------------------|-----------| +| meta-llama/Llama-3.1 / 3.3 | 
LlamaForCausalLM | ✅ | +| meta-llama/Llama-4-Scout | Llama4ForConditionalGeneration | ✅ | +| meta-llama/Llama-4-Maverick | Llama4ForConditionalGeneration | ✅ | +| ibm-granite/granite (Granite-MOE) | GraniteMoeForCausalLM | ✅ | +| Qwen/Qwen3 | Qwen3ForCausalLM | ✅ | +| zai-org/GLM-4.5 | GLMForCausalLM | ✅ | +| google/gemma | GemmaForCausalLM | ✅ | + +### Multimodal Language Models + +| Model | Architecture | Supported | +|--------------------------------------|-------------------------------------------|-----------| +| Qwen/Qwen2.5-VL | Qwen2VLForConditionalGeneration | ✅ | +| openai/whisper | WhisperForConditionalGeneration | ✅ | + +✅ Runs and optimized. +🟨 Runs and correct but not optimized to green yet. +❌ Does not pass accuracy test or does not run. From d69062c67af46a2e624be92162e9db585eef329b Mon Sep 17 00:00:00 2001 From: gnovack Date: Wed, 19 Nov 2025 00:32:00 -0800 Subject: [PATCH 010/249] add support for --fully-sharded-loras in fused_moe (#28761) Signed-off-by: gnovack Co-authored-by: Jee Jee Li --- tests/lora/test_fused_moe_lora_kernel.py | 208 +++++++++++++++++- tests/lora/test_olmoe_tp.py | 10 +- vllm/lora/layers/fused_moe.py | 36 ++- vllm/lora/ops/triton_ops/fused_moe_lora_op.py | 24 +- vllm/lora/punica_wrapper/punica_base.py | 2 + vllm/lora/punica_wrapper/punica_gpu.py | 4 + 6 files changed, 274 insertions(+), 10 deletions(-) diff --git a/tests/lora/test_fused_moe_lora_kernel.py b/tests/lora/test_fused_moe_lora_kernel.py index 91ab4a87c65f8..91c8b861c3c5c 100644 --- a/tests/lora/test_fused_moe_lora_kernel.py +++ b/tests/lora/test_fused_moe_lora_kernel.py @@ -1,13 +1,25 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os import random import pytest import torch +from tests.utils import multi_gpu_test from vllm import _custom_ops as ops +from vllm.distributed import ( + init_distributed_environment, + initialize_model_parallel, + tensor_model_parallel_all_gather, + 
tensor_model_parallel_all_reduce, +) +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_world_size, +) from vllm.lora.ops.triton_ops import fused_moe_lora from vllm.platforms import current_platform +from vllm.utils.network_utils import get_open_port @pytest.fixture(autouse=True) @@ -122,6 +134,8 @@ def use_fused_moe_lora_kernel( max_loras, num_experts, block_size, + fully_sharded=False, + offset=0, ): max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) max_num_tokens_padded = round_up(max_num_tokens_padded, block_size) @@ -195,10 +209,10 @@ def use_fused_moe_lora_kernel( config["NUM_STAGES"], config["SPLIT_K"], mul_routed_weight, + fully_sharded=fully_sharded, + offset=offset, ) - return output - def use_torch( hidden_states, @@ -317,3 +331,193 @@ def test_fused_moe_lora_kernel( ) torch.testing.assert_close(output, output2, atol=1e-1, rtol=1e-1) + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize("num_tokens", [100]) +@pytest.mark.parametrize("top_k_num", [6]) +@pytest.mark.parametrize("num_experts", [64]) +@pytest.mark.parametrize("max_loras", [4]) +@pytest.mark.parametrize("N", [1408]) +@pytest.mark.parametrize("K", [2048]) +@pytest.mark.parametrize("max_lora_rank", [16, 32, 64]) +@pytest.mark.parametrize("block_size", [16]) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("column_parallel", [True, False]) +def test_fused_moe_lora_kernel_fully_sharded( + num_tokens, + top_k_num, + num_experts, + max_loras, + N, + K, + max_lora_rank, + block_size, + dtype, + seed, + column_parallel, +): + current_platform.seed_everything(seed) + # the number of randomly generated sentences. 
+ num_sequences = 10 + # generate data + topk_ids, topk_weights, token_lora_mapping = sample_data( + num_tokens, num_sequences, max_loras, num_experts, top_k_num + ) + + def run_torch_spawn(fn, nprocs): + torch.multiprocessing.spawn( + fn, + args=( + nprocs, + f"tcp://{os.getenv('LOCALHOST', 'localhost')}:{get_open_port()}", + dtype, + seed, + N, + K, + num_tokens, + topk_ids, + topk_weights, + token_lora_mapping, + max_lora_rank, + top_k_num, + max_loras, + num_experts, + block_size, + column_parallel, + ), + nprocs=nprocs, + ) + + run_torch_spawn(use_fused_moe_lora_kernel_tensor_parallel, nprocs=2) + + +def use_fused_moe_lora_kernel_tensor_parallel( + local_rank, + world_size, + init_method, + dtype, + seed, + N, + K, + num_tokens, + topk_ids, + topk_weights, + token_lora_mapping, + max_lora_rank, + top_k_num, + max_loras, + num_experts, + block_size, + column_parallel, +): + def _get_shard_slice(shard_size): + return slice(local_rank * shard_size, (local_rank + 1) * shard_size) + + current_platform.seed_everything(seed) + + device = torch.device(f"cuda:{local_rank}") + torch.cuda.set_device(device) + torch.set_default_device(device) + torch.set_default_dtype(dtype) + + init_distributed_environment( + world_size=world_size, + rank=local_rank, + local_rank=local_rank, + distributed_init_method=init_method, + ) + initialize_model_parallel(world_size, 1) + tp_size = get_tensor_model_parallel_world_size() + + input_dim = K if column_parallel else N + output_dim = N if column_parallel else K + + # init lora weights + lora_a = torch.rand( + ( + max_loras, + num_experts, + max_lora_rank, + input_dim, + ), + dtype=dtype, + ) + lora_b = torch.rand( + ( + max_loras, + num_experts, + output_dim, + max_lora_rank, + ), + dtype=dtype, + ) + + hidden_states = torch.rand( + ( + num_tokens, + input_dim, + ), + dtype=dtype, + ) + + output = torch.zeros((num_tokens, top_k_num, output_dim), dtype=dtype) + topk_ids = topk_ids.to(device) + topk_weights = topk_weights.to(device) + 
token_lora_mapping = token_lora_mapping.to(device) + + ref_output = use_torch( + hidden_states, + token_lora_mapping, + topk_ids, + [lora_a], + [lora_b], + top_k_num, + ) + + if column_parallel: + # Column parallel (e.g. gate_up_proj): LoRA A is sliced along the rank dim, + # and Lora B is sliced along the output dim + lora_a_shard_size = max_lora_rank // tp_size + lora_a = lora_a[:, :, _get_shard_slice(lora_a_shard_size), :] + max_lora_rank = lora_a_shard_size + offset = 0 + + lora_b_shard_size = output_dim // tp_size + lora_b = lora_b[:, :, _get_shard_slice(lora_b_shard_size), :] + output = output[:, :, _get_shard_slice(lora_b_shard_size)].contiguous() + else: + # Row parallel (e.g. down proj): LoRA A is sliced along the input dim, + # and LoRA B is sliced along the output dim + lora_a_shard_size = input_dim // tp_size + lora_a = lora_a[:, :, :, _get_shard_slice(lora_a_shard_size)] + hidden_states = hidden_states[:, _get_shard_slice(lora_a_shard_size)] + + lora_b_shard_size = output_dim // tp_size + lora_b = lora_b[:, :, _get_shard_slice(lora_b_shard_size), :] + offset = lora_b_shard_size * local_rank + + use_fused_moe_lora_kernel( + topk_ids, + topk_weights, + token_lora_mapping, + max_lora_rank, + top_k_num, + [lora_a], + [lora_b], + hidden_states, + output, + max_loras, + num_experts, + block_size, + fully_sharded=True, + offset=offset, + ) + + if column_parallel: + output = tensor_model_parallel_all_gather(output) + else: + output = tensor_model_parallel_all_reduce(output) + + torch.testing.assert_close(output, ref_output, atol=1e-1, rtol=1e-1) diff --git a/tests/lora/test_olmoe_tp.py b/tests/lora/test_olmoe_tp.py index e659c1e1a9a07..e3c9816625ba7 100644 --- a/tests/lora/test_olmoe_tp.py +++ b/tests/lora/test_olmoe_tp.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + import vllm from vllm.lora.request import LoRARequest @@ -111,8 +113,9 @@ def test_olmoe_lora_mixed(olmoe_lora_files): 
generate_and_test(llm, olmoe_lora_files, lora_id=[1, None, 3, None]) +@pytest.mark.parametrize("fully_sharded_loras", [False, True]) @multi_gpu_test(num_gpus=2) -def test_olmoe_lora_tp2(olmoe_lora_files): +def test_olmoe_lora_tp2(olmoe_lora_files, fully_sharded_loras): llm = vllm.LLM( MODEL_PATH, max_model_len=1024, @@ -122,14 +125,16 @@ def test_olmoe_lora_tp2(olmoe_lora_files): trust_remote_code=True, enable_chunked_prefill=True, tensor_parallel_size=2, + fully_sharded_loras=fully_sharded_loras, ) generate_and_test(llm, olmoe_lora_files, lora_id=1) generate_and_test(llm, olmoe_lora_files, lora_id=2) +@pytest.mark.parametrize("fully_sharded_loras", [False, True]) @multi_gpu_test(num_gpus=4) -def test_olmoe_lora_tp4(olmoe_lora_files): +def test_olmoe_lora_tp4(olmoe_lora_files, fully_sharded_loras): llm = vllm.LLM( MODEL_PATH, max_model_len=1024, @@ -139,6 +144,7 @@ def test_olmoe_lora_tp4(olmoe_lora_files): trust_remote_code=True, enable_chunked_prefill=True, tensor_parallel_size=4, + fully_sharded_loras=fully_sharded_loras, ) generate_and_test(llm, olmoe_lora_files, lora_id=1) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 8fb3efa220f6d..3291c41fcda1e 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -12,6 +12,7 @@ from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, ) +from vllm.distributed.utils import divide from vllm.lora.layers.base import BaseLayerWithLoRA from vllm.lora.ops.triton_ops.utils import get_lora_op_configs from vllm.model_executor.layers.fused_moe import FusedMoE @@ -205,6 +206,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): shrink_config, ## pass the shrink config expand_config, ## pass the expand config self.adapter_enabled, + fully_sharded=self.fully_sharded, ) result = func(*args, **kwargs) @@ -250,7 +252,10 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): sorted_token_ids_lora = sorted_token_ids_lora.view(max_loras, 
-1) intermediate_cache2 = moe_state_dict["intermediate_cache2"] intermediate_cache3 = args[0] - max_lora_rank = self.w1_lora_a_stacked.shape[-2] + max_lora_rank = self.w2_lora_a_stacked.shape[-2] + + shard_size_w2 = divide(self.base_layer.hidden_size, self.tp_size) + self.punica_wrapper.add_lora_fused_moe( intermediate_cache3, intermediate_cache2, @@ -266,6 +271,8 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): expand_config, ## pass the expand config self.adapter_enabled, True, + fully_sharded=self.fully_sharded, + offset=shard_size_w2 * self.tp_rank if self.fully_sharded else 0, ) result = func(*args, **kwargs) @@ -294,6 +301,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): model_config: PretrainedConfig | None = None, ) -> None: """Initializes lora matrices.""" + self.fully_sharded = lora_config.fully_sharded_loras self.adapter_enabled = torch.tensor( [0] * (max_loras + 1), dtype=torch.int, device=self.device @@ -303,7 +311,9 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): ( max_loras, self.base_layer.local_num_experts, - lora_config.max_lora_rank, + lora_config.max_lora_rank + if not self.fully_sharded + else divide(lora_config.max_lora_rank, self.tp_size), self.base_layer.hidden_size, ), dtype=lora_config.lora_dtype, @@ -334,7 +344,9 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): ( max_loras, self.base_layer.local_num_experts, - self.base_layer.hidden_size, + self.base_layer.hidden_size + if not self.fully_sharded + else divide(self.base_layer.hidden_size, self.tp_size), lora_config.max_lora_rank, ), dtype=lora_config.lora_dtype, @@ -345,7 +357,9 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): ( max_loras, self.base_layer.local_num_experts, - lora_config.max_lora_rank, + lora_config.max_lora_rank + if not self.fully_sharded + else divide(lora_config.max_lora_rank, self.tp_size), self.base_layer.hidden_size, ), dtype=lora_config.lora_dtype, @@ -419,6 +433,20 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): w3_lora_b = w3_lora_b[start_idx:end_idx, :] w2_lora_a = 
w2_lora_a[:, start_idx:end_idx] + if self.fully_sharded: + # Based on S-LoRA, we slice W1 and W3 A along the rank dim, + # and W2 B along the hidden_size dim. + w13_shard_size = self.w1_lora_a_stacked[index, eid].shape[0] + w13_start_idx = self.tp_rank * w13_shard_size + w13_end_idx = (self.tp_rank + 1) * w13_shard_size + w1_lora_a = w1_lora_a[w13_start_idx:w13_end_idx, :] + w3_lora_a = w3_lora_a[w13_start_idx:w13_end_idx, :] + + w2_shard_size = self.w2_lora_b_stacked[index, eid].shape[0] + w2_start_idx = self.tp_rank * w2_shard_size + w2_end_idx = (self.tp_rank + 1) * w2_shard_size + w2_lora_b = w2_lora_b[w2_start_idx:w2_end_idx, :] + self.w1_lora_a_stacked[ index, eid, : w1_lora_a.shape[0], : w1_lora_a.shape[1] ].copy_(w1_lora_a, non_blocking=True) diff --git a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py index e2dd47dbb4e64..413ee8ecbbf96 100644 --- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py +++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py @@ -3,6 +3,10 @@ import torch +from vllm.distributed import ( + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce, +) from vllm.triton_utils import tl, triton from vllm.utils.torch_utils import direct_register_custom_op @@ -311,6 +315,7 @@ def _fused_moe_lora_expand( num_stages: int, split_k: int, mul_routed_weight: bool = False, + offset: int = 0, ) -> None: b_ptr = _get_ptr(lora_b_stacked, device) K = max_lora_rank @@ -380,7 +385,7 @@ def _fused_moe_lora_expand( **expand_config, ) for i in range(num_slices): - output[:, :, i * N : (i + 1) * N] += b_intermediate_cache1[i] + output[:, :, i * N + offset : (i + 1) * N + offset] += b_intermediate_cache1[i] @torch.inference_mode() @@ -416,6 +421,8 @@ def _fused_moe_lora( expand_num_stages: int, expand_split_k: int, mul_routed_weight: bool = False, + fully_sharded: bool = False, + offset: int = 0, ) -> None: assert len(lora_a_stacked) == len(lora_b_stacked) > 0 assert ( @@ -430,7 +437,6 @@ def 
_fused_moe_lora( == expert_ids.shape[0] == num_tokens_post_padded.shape[0] ) - assert len(lora_b_stacked) * lora_b_stacked[0].shape[-2] == output.shape[-1] assert output.shape[0] == topk_weights.shape[0] assert top_k_num == topk_weights.shape[1] device = qcurr_hidden_states.device @@ -480,6 +486,19 @@ def _fused_moe_lora( mul_routed_weight, ) + if fully_sharded: + if max_lora_rank == w1_lora_b_stacked.shape[-1]: + a_intermediate_cache1 = tensor_model_parallel_all_reduce( + a_intermediate_cache1 + ) + else: + a_intermediate_cache1 = tensor_model_parallel_all_gather( + a_intermediate_cache1 + ) + + # reset max_lora_rank to the full rank after allgather + max_lora_rank = a_intermediate_cache1.shape[-1] + _fused_moe_lora_expand( output, a_intermediate_cache1, @@ -510,6 +529,7 @@ def _fused_moe_lora( expand_num_stages, expand_split_k, mul_routed_weight, + offset, ) diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index b6186e8561529..a6ffbb7b71ce4 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -483,6 +483,8 @@ class PunicaWrapperBase(PunicaWrapperABC): expand_config, adapter_enabled: torch.Tensor, mul_routed_weight=False, + fully_sharded: bool = False, + offset: int = 0, ): """ Performs a fused forward computation for LoRA of diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index ede50a48af985..d863a5884d3c5 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -375,6 +375,8 @@ class PunicaWrapperGPU(PunicaWrapperBase): expand_config, adapter_enabled: torch.Tensor, mul_routed_weight=False, + fully_sharded: bool = False, + offset: int = 0, ): """ Performs a fused forward computation for LoRA of Mixture-of-Experts (MoE) layer. 
@@ -408,4 +410,6 @@ class PunicaWrapperGPU(PunicaWrapperBase): expand_config.get("NUM_STAGES", 3), expand_config.get("SPLIT_K", 1), mul_routed_weight, + fully_sharded, + offset, ) From fdf93486d6c4f36be2f410a846bf68654041dc51 Mon Sep 17 00:00:00 2001 From: Michael Yao Date: Wed, 19 Nov 2025 18:35:29 +0800 Subject: [PATCH 011/249] [Docs] Clean up moe_kernel_features.md (#28530) Signed-off-by: windsonsea --- docs/design/moe_kernel_features.md | 90 +++++++++++++++--------------- 1 file changed, 44 insertions(+), 46 deletions(-) diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md index 36ae9506b65fb..f0d5a3e934f39 100644 --- a/docs/design/moe_kernel_features.md +++ b/docs/design/moe_kernel_features.md @@ -1,4 +1,4 @@ -# Fused MoE Kernel features +# Fused MoE Kernel Features The purpose of this document is to provide an overview of the various MoE kernels (both modular and non-modular) so it will be easier to select an appropriate set of kernels for any particular situation. This includes information about the all2all backends used by modular kernels. @@ -8,15 +8,15 @@ There are a number of all2all communication backends that are used to implement The following table describes the relevant features of each backend, i.e. activation format, supported quantization schemes and async support. -The output activation format (standard or batched) corresponds to the output of the prepare step of the `FusedMoEPrepareAndFinalize` subclass, the finalize step requires the same format. All the backend `prepare` methods expect activations in standard format and all the `finalize methods return activations in standard format. More details on the formats can be found in the [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) document. +The output activation format (standard or batched) corresponds to the output of the prepare step of the `FusedMoEPrepareAndFinalize` subclass, and the finalize step requires the same format. 
All the backend `prepare` methods expect activations in the standard format and all the `finalize` methods return activations in standard format. More details on the formats can be found in the [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) document. -The quantization types and formats enumerate which quantization schemes are supported by each `FusedMoEPrepareAndFinalize` class. The quantization can happen before or after the dispatch based on the format the all2all backend supports. e.g. deepep_high_throughput supports only block-quantized fp8 format, any other format will result in dispatching in higher precision and quantizing afterwards. The output of the prepare step for each backend is the quantized type. The finalize step generally requires the same input type as the original activations, e.g. if the original input is bfloat16 and the quantization scheme is fp8 w/per-tensor scales, `prepare` will return fp8/per-tensor scale activations and `finalize` will take bfloat16 activations. See the diagrams in [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) for more details on the types and formats of activations at each step of the MoE process. If no quantization type is specified, the kernel operates on float16 and/or bfloat16. +The quantization types and formats enumerate which quantization schemes are supported by each `FusedMoEPrepareAndFinalize` class. The quantization can happen before or after the dispatch based on the format the all2all backend supports, e.g. deepep_high_throughput supports only block-quantized fp8 format. Any other format will result in dispatching in higher precision and quantizing afterwards. The output of the prepare step for each backend is the quantized type. The finalize step generally requires the same input type as the original activations, e.g. 
if the original input is bfloat16 and the quantization scheme is fp8 with per-tensor scales, `prepare` will return fp8/per-tensor scale activations and `finalize` will take bfloat16 activations. See the diagrams in [Fused MoE Modular Kernel](./fused_moe_modular_kernel.md) for more details on the types and formats of activations at each step of the MoE process. If no quantization type is specified, the kernel operates on float16 and/or bfloat16. Async backends support the use of DBO (Dual Batch Overlap) and shared expert overlap (where shared experts are computed during the combine step). -Certain models require the topk weights to be applied to the input activations rather than the output activations when topk==1, e.g. llama. For modular kernels, this feature is supported by the `FusedMoEPrepareAndFinalize` subclass, for non-modular kernels, it is up to the experts function to deal with this flag. +Certain models require the topk weights to be applied to the input activations rather than the output activations when topk==1, e.g. Llama. For modular kernels, this feature is supported by the `FusedMoEPrepareAndFinalize` subclass. For non-modular kernels, it is up to the experts function to deal with this flag. -unless otherwise specified, backends are controlled via `VLLM_ALL2ALL_BACKEND`. All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP w/o EP. +Unless otherwise specified, backends are controlled via `VLLM_ALL2ALL_BACKEND`. All backends except `flashinfer` only work with EP+DP or EP+TP. `Flashinfer` can work with EP or DP without EP. -| Backend | Output act. format | Quant. types | Quant. 
format | Async | Apply Weight On Input | Sub-class | -|---------------------------------------|--------------------|-----------------|------------------------|-------|-----------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| -| naive | standard | all1 | G,A,T | N | 6 | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE.forward_impl] | -| pplx | batched | fp8,int8 | G,A,T | Y | Y | [`PplxPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.pplx_prepare_finalize.PplxPrepareAndFinalize] | -| deepep_high_throughput | standard | fp8 | G(128),A,T2 | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] | -| deepep_low_latency | batched | fp8 | G(128),A,T3 | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] | -| flashinfer_all2allv | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferAllToAllMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferAllToAllMoEPrepareAndFinalize] | -| flashinfer4 | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize] | -| flashinfer4 | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize] | -| MoEPrepareAndFinalizeNoEP5 | standard | fp8,int8 | G,A,T | N | Y | [`MoEPrepareAndFinalizeNoEP`][vllm.model_executor.layers.fused_moe.prepare_finalize.MoEPrepareAndFinalizeNoEP] | -| BatchedPrepareAndFinalize5 | batched | fp8,int8 | G,A,T | N | Y | 
[`BatchedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedPrepareAndFinalize] | +| Backend | Output act. format | Quant. types | Quant. format | Async | Apply Weight On Input | Subclass | +|---------|--------------------|--------------|---------------|-------|-----------------------|-----------| +| naive | standard | all1 | G,A,T | N | 6 | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE.forward_impl] | +| pplx | batched | fp8,int8 | G,A,T | Y | Y | [`PplxPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.pplx_prepare_finalize.PplxPrepareAndFinalize] | +| deepep_high_throughput | standard | fp8 | G(128),A,T2 | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] | +| deepep_low_latency | batched | fp8 | G(128),A,T3 | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] | +| flashinfer_all2allv | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferAllToAllMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferAllToAllMoEPrepareAndFinalize] | +| flashinfer4 | standard | nvfp4,fp8 | G,A,T | N | N | [`FlashInferCutlassMoEPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize.FlashInferCutlassMoEPrepareAndFinalize] | +| MoEPrepareAndFinalizeNoEP5 | standard | fp8,int8 | G,A,T | N | Y | [`MoEPrepareAndFinalizeNoEP`][vllm.model_executor.layers.fused_moe.prepare_finalize.MoEPrepareAndFinalizeNoEP] | +| BatchedPrepareAndFinalize5 | batched | fp8,int8 | G,A,T | N | Y | [`BatchedPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedPrepareAndFinalize] | !!! info "Table key" 1. All types: mxfp4, nvfp4, int4, int8, fp8 2. A,T quantization occurs after dispatch. 3. All quantization happens after dispatch. 4. 
Controlled by different env vars (`VLLM_FLASHINFER_MOE_BACKEND` "throughput" or "latency") - 5. This is a no-op dispatcher that can be used to pair with any modular experts to produce a modular kernel that runs w/o dispatch or combine. These cannot be selected via environment variable. These are generally use for testing or adapting an expert subclass to the `fused_experts` API. + 5. This is a no-op dispatcher that can be used to pair with any modular experts to produce a modular kernel that runs without dispatch or combine. These cannot be selected via environment variable. These are generally use for testing or adapting an expert subclass to the `fused_experts` API. 6. This depends on the experts implementation. --- @@ -66,44 +65,43 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes. - [`Mxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.Mxfp4MoEMethod] - [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod] -## Fused MoE Experts Kernels +## Fused Experts Kernels -There are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties. +There are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters, so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties. -Each kernel must be provided with one of the supported input activation formats. 
Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx`, `DeepEPLLPrepareAndFinalize`. +Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx` and `DeepEPLLPrepareAndFinalize`. Similar to the backend kernels, each experts kernel only supports certain quantization formats. For non-modular experts, the activations will be in the original type and quantized internally by the kernel. Modular experts will expect the activations to already be in the quantized format. Both types of experts will yield outputs in the original activation type. -Each experts kernel supports one or more activation functions, e.g. silu, gelu that are applied to the intermediate results. +Each experts kernel supports one or more activation functions, e.g. silu or gelu, which are applied to the intermediate results. As with the backends, some experts support applying topk weights on the input activations. The entries in the column in this table only apply to the non-modular experts. Most experts flavors include an equivalent modular interface which will be a subclass of `FusedMoEPermuteExpertsUnpermute`. -To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels must have compatible activation formats, quantization types and quantization formats. +To be used with a particular `FusedMoEPrepareAndFinalize` subclass, MoE kernels must have compatible activation formats, quantization types and quantization formats. -| Kernel | Input act. format | Quant. types | Quant. 
format | Activation function | Apply Weight On Input | Modular | Source | -|------------------------------|-----------------------|------------------|---------------|-------------------------------------------------------------|-----------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| triton | standard | all1 | G,A,T | silu, gelu,
swigluoai,
silu_no_mul,
gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],
[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] | -| triton (batched) | batched | all1 | G,A,T | silu, gelu | 6 | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] | -| deep gemm | standard,
batched | fp8 | G(128),A,T | silu, gelu | 6 | Y | [`deep_gemm_moe_fp8`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.deep_gemm_moe_fp8],
[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],
[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] | -| cutlass_fp4 | standard,
batched | nvfp4 | A,T | silu | Y | Y | [`cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp4],
[`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] | -| cutlass_fp8 | standard,
batched | fp8 | A,T | silu, gelu | Y | Y | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],
[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],
[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] | -| flashinfer | standard | nvfp4,
fp8 | T | 5 | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],
[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] | -| gpt oss triton | standard | N/A | N/A | 5 | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],
[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] | -| deep gemm+triton2 | standard,
batched | all1 | G(128),A,T | silu, gelu | 6 | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],
[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] | -| marlin | standard | 3 | 3 | silu,
swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],
[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],
[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] | -| marlin experts | standard,
batched | N/A | N/A | silu,
swigluoai | Y | Y | [`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],
[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] | -| trtllm | standard | mxfp4,
nvfp4 | G(16),G(32) | 5 | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] | -| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] | -| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] | -| rocm aiter moe | standard | fp8 | G(128),A,T | silu, gelu | Y | N | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_experts] | -| cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] | -| naive batched4 | batched | int8,
fp8 | G,A,T | silu, gelu | 6 | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] | +| Kernel | Input act. format | Quant. types | Quant. format | Activation function | Apply Weight On Input | Modular | Source | +|--------|-------------------|--------------|---------------|---------------------|-----------------------|---------|--------| +| triton | standard | all1 | G,A,T | silu, gelu,
swigluoai,
silu_no_mul,
gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],
[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] | +| triton (batched) | batched | all1 | G,A,T | silu, gelu | 6 | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] | +| deep gemm | standard,
batched | fp8 | G(128),A,T | silu, gelu | 6 | Y | [`deep_gemm_moe_fp8`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.deep_gemm_moe_fp8],
[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],
[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] | +| cutlass_fp4 | standard,
batched | nvfp4 | A,T | silu | Y | Y | [`cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp4],
[`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] | +| cutlass_fp8 | standard,
batched | fp8 | A,T | silu, gelu | Y | Y | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],
[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],
[`CutlassBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] | +| flashinfer | standard | nvfp4,
fp8 | T | 5 | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],
[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] | +| gpt oss triton | standard | N/A | N/A | 5 | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],
[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] | +| deep gemm+triton2 | standard,
batched | all1 | G(128),A,T | silu, gelu | 6 | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],
[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] | +| marlin | standard,
batched | 3 / N/A | 3 / N/A | silu,
swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],
[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],
[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] | +| trtllm | standard | mxfp4,
nvfp4 | G(16),G(32) | 5 | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] | +| pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] | +| iterative | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_torch_iterative.fused_moe] | +| rocm aiter moe | standard | fp8 | G(128),A,T | silu, gelu | Y | N | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_experts] | +| cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] | +| naive batched4 | batched | int8,
fp8 | G,A,T | silu, gelu | 6 | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] | !!! info "Table key" 1. All types: mxfp4, nvfp4, int4, int8, fp8 - 2. A dispatcher wrapper around triton and deep gemm experts. Will select based on type + shape + quantization params + 2. A dispatcher wrapper around triton and deep gemm experts. Will select based on type + shape + quantization params 3. uint4, uint8, fp8, fp4 4. This is a naive implementation of experts that supports batched format. Mainly used for testing. 5. The `activation` parameter is ignored and SwiGlu is used by default instead. @@ -113,8 +111,8 @@ To be used with a particular `FusedMoEPrepareAndFinalize` sub-class, MoE kernels The following table shows "families" of modular kernels that are intended to work together. There are some combinations which may work but have not yet been tested, e.g. flashinfer with other fp8 experts. Note that the "naive" backend will work with any non-modular experts. -| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses | -|----------------------------------|------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------| -| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,
`TritonExperts`,
`TritonOrDeepGemmExperts`,
`CutlassExpertsFp8`,
`MarlinExperts` | -| deepep_low_latency,
pplx | `DeepEPLLPrepareAndFinalize`,
`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,
`BatchedTritonExperts`,
`BatchedTritonOrDeepGemmExperts`,
`CutlassBatchedExpertsFp8`,
`BatchedMarlinExperts`| -| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` | +| backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses | +|---------|-----------------------------------------|----------------------------------------------| +| deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,
`TritonExperts`,
`TritonOrDeepGemmExperts`,
`CutlassExpertsFp8`,
`MarlinExperts` | +| deepep_low_latency,
pplx | `DeepEPLLPrepareAndFinalize`,
`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,
`BatchedTritonExperts`,
`BatchedTritonOrDeepGemmExperts`,
`CutlassBatchedExpertsFp8`,
`BatchedMarlinExperts` | +| flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` | From 815160958327d601933139b9e76a01eb6d2bc5cf Mon Sep 17 00:00:00 2001 From: ihb2032 <40718643+ihb2032@users.noreply.github.com> Date: Wed, 19 Nov 2025 19:05:44 +0800 Subject: [PATCH 012/249] refactor(cpu_types_scalar.hpp): Unify scalar loop implementations using unroll_loop (#28847) Signed-off-by: ihb2032 <1355790728@qq.com> Co-authored-by: lyd1992 --- csrc/cpu/cpu_types_scalar.hpp | 222 +++++++++++++--------------------- 1 file changed, 87 insertions(+), 135 deletions(-) diff --git a/csrc/cpu/cpu_types_scalar.hpp b/csrc/cpu/cpu_types_scalar.hpp index 1a9278bc662e5..f9da78283da5e 100644 --- a/csrc/cpu/cpu_types_scalar.hpp +++ b/csrc/cpu/cpu_types_scalar.hpp @@ -26,10 +26,6 @@ namespace vec_op { #define FORCE_INLINE __attribute__((always_inline)) inline -#define __max(a, b) ((a) > (b) ? (a) : (b)) -#define __min(a, b) ((a) < (b) ? (a) : (b)) -#define __abs(a) ((a) < (0) ? (0 - a) : (a)) - typedef struct f16x8_t { uint16_t val[8]; } f16x8_t; @@ -99,7 +95,7 @@ struct FP16Vec16 : public Vec { void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } void save(void* ptr, const int elem_num) const { - int num = __min(elem_num, VEC_ELEM_NUM); + int num = std::min(elem_num, VEC_ELEM_NUM); std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t)); } }; @@ -128,7 +124,7 @@ struct BF16Vec16 : public Vec { void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } void save(void* ptr, const int elem_num) const { - int num = __min(elem_num, VEC_ELEM_NUM); + int num = std::min(elem_num, VEC_ELEM_NUM); std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t)); } }; @@ -143,9 +139,9 @@ struct BF16Vec32 : public Vec { explicit BF16Vec32(f16x32_t data) : reg(data) {}; explicit BF16Vec32(BF16Vec8& vec8_data) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { + unroll_loop([&vec8_data, this](int i) { reg.val[i] = vec8_data.reg.val[i % BF16Vec8::VEC_ELEM_NUM]; - } + }); } void 
save(void* ptr) const { *reinterpret_cast(ptr) = reg; } @@ -157,15 +153,11 @@ struct FP32Vec4 : public Vec { f32x4_t reg; explicit FP32Vec4(float v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = v; - } + unroll_loop([&v, this](int i) { reg.val[i] = v; }); } explicit FP32Vec4() { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = 0.0f; - } + unroll_loop([this](int i) { reg.val[i] = 0.0f; }); } explicit FP32Vec4(const float* ptr) @@ -182,15 +174,11 @@ struct FP32Vec8 : public Vec { f32x8_t reg; explicit FP32Vec8(float v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = v; - } + unroll_loop([&v, this](int i) { reg.val[i] = v; }); } explicit FP32Vec8() { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = 0.0f; - } + unroll_loop([this](int i) { reg.val[i] = 0.0f; }); } explicit FP32Vec8(const float* ptr) @@ -201,78 +189,68 @@ struct FP32Vec8 : public Vec { explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {}; explicit FP32Vec8(const FP16Vec8& v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = fp16_to_float(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = fp16_to_float(v.reg.val[i]); }); } FP32Vec8(const BF16Vec8& v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = bf16_to_float(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = bf16_to_float(v.reg.val[i]); }); } float reduce_sum() const { float result = 0; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result += reg.val[i]; - } + unroll_loop( + [&result, this](int i) { result += reg.val[i]; }); return result; } FP32Vec8 exp() const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = expf(reg.val[i]); - } + unroll_loop( + [&ret, this](int i) { ret.val[i] = expf(reg.val[i]); }); return FP32Vec8(ret); } FP32Vec8 tanh() const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = tanhf(reg.val[i]); - } + unroll_loop( + [&ret, this](int i) { ret.val[i] = tanhf(reg.val[i]); }); return 
FP32Vec8(ret); } FP32Vec8 er() const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = erf(reg.val[i]); - } + unroll_loop( + [&ret, this](int i) { ret.val[i] = erf(reg.val[i]); }); return FP32Vec8(ret); } FP32Vec8 operator*(const FP32Vec8& b) const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = reg.val[i] * b.reg.val[i]; - } + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] * b.reg.val[i]; }); return FP32Vec8(ret); } FP32Vec8 operator+(const FP32Vec8& b) const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = reg.val[i] + b.reg.val[i]; - } + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] + b.reg.val[i]; }); return FP32Vec8(ret); } FP32Vec8 operator-(const FP32Vec8& b) const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = reg.val[i] - b.reg.val[i]; - } + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] - b.reg.val[i]; }); return FP32Vec8(ret); } FP32Vec8 operator/(const FP32Vec8& b) const { f32x8_t ret; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - ret.val[i] = reg.val[i] / b.reg.val[i]; - } + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] / b.reg.val[i]; }); return FP32Vec8(ret); } @@ -284,15 +262,11 @@ struct FP32Vec16 : public Vec { f32x16_t reg; explicit FP32Vec16(float v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = v; - } + unroll_loop([&v, this](int i) { reg.val[i] = v; }); } explicit FP32Vec16() { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = 0.0f; - } + unroll_loop([this](int i) { reg.val[i] = 0.0f; }); } explicit FP32Vec16(const float* ptr) @@ -301,29 +275,27 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16(f32x16_t data) : reg(data) {}; FP32Vec16(const FP32Vec4& data) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { + unroll_loop([&data, this](int i) { reg.val[i] = data.reg.val[i % FP32Vec4::VEC_ELEM_NUM]; - } + }); } FP32Vec16(const FP32Vec8& data) { - for (int i = 
0; i < VEC_ELEM_NUM; ++i) { + unroll_loop([&data, this](int i) { reg.val[i] = data.reg.val[i % FP32Vec8::VEC_ELEM_NUM]; - } + }); } FP32Vec16(const FP32Vec16& data) : reg(data.reg) {}; explicit FP32Vec16(const FP16Vec16& v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = fp16_to_float(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = fp16_to_float(v.reg.val[i]); }); } explicit FP32Vec16(const BF16Vec16& v) { - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - reg.val[i] = bf16_to_float(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = bf16_to_float(v.reg.val[i]); }); } explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}; @@ -331,82 +303,74 @@ struct FP32Vec16 : public Vec { FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}; FP32Vec16 operator*(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = reg.val[i] * b.reg.val[i]; - } - return result; + f32x16_t ret; + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] * b.reg.val[i]; }); + return FP32Vec16(ret); } FP32Vec16 operator+(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = reg.val[i] + b.reg.val[i]; - } - return result; + f32x16_t ret; + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] + b.reg.val[i]; }); + return FP32Vec16(ret); } FP32Vec16 operator-(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = reg.val[i] - b.reg.val[i]; - } - return result; + f32x16_t ret; + unroll_loop( + [&ret, &b, this](int i) { ret.val[i] = reg.val[i] - b.reg.val[i]; }); + return FP32Vec16(ret); } FP32Vec16 operator/(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = reg.val[i] / b.reg.val[i]; - } - return result; + f32x16_t ret; + unroll_loop( + [&ret, &b, this](int i) { 
ret.val[i] = reg.val[i] / b.reg.val[i]; }); + return FP32Vec16(ret); } FP32Vec16 max(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = __max(reg.val[i], b.reg.val[i]); - } - return result; + f32x16_t ret; + unroll_loop([&ret, &b, this](int i) { + ret.val[i] = std::max(reg.val[i], b.reg.val[i]); + }); + return FP32Vec16(ret); } FP32Vec16 min(const FP32Vec16& b) const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = __min(reg.val[i], b.reg.val[i]); - } - return result; + f32x16_t ret; + unroll_loop([&ret, &b, this](int i) { + ret.val[i] = std::min(reg.val[i], b.reg.val[i]); + }); + return FP32Vec16(ret); } FP32Vec16 abs() const { - FP32Vec16 result(0.0f); - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result.reg.val[i] = __abs(reg.val[i]); - } - return result; + f32x16_t ret; + unroll_loop( + [&ret, this](int i) { ret.val[i] = std::abs(reg.val[i]); }); + return FP32Vec16(ret); } float reduce_sum() const { float result = 0.0f; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result += reg.val[i]; - } + unroll_loop( + [&result, this](int i) { result += reg.val[i]; }); return result; } float reduce_max() const { - float result = reg.val[0]; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result = __max(reg.val[i], result); - } + float result = std::numeric_limits::lowest(); + unroll_loop( + [&result, this](int i) { result = std::max(reg.val[i], result); }); return result; } float reduce_min() const { - float result = reg.val[0]; - for (int i = 0; i < VEC_ELEM_NUM; ++i) { - result = __min(reg.val[i], result); - } + float result = std::numeric_limits::max(); + unroll_loop( + [&result, this](int i) { result = std::min(reg.val[i], result); }); return result; } @@ -414,13 +378,9 @@ struct FP32Vec16 : public Vec { float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); float sum = 0.0; - int start = idx * group_size; - int end = (idx + 1) * 
group_size; - - for (; (start < VEC_ELEM_NUM) && (start < end); ++start) { - sum += reg.val[start]; - } - + const int start = idx * group_size; + unroll_loop( + [&sum, &start, this](int i) { sum += reg.val[start + i]; }); return sum; } @@ -477,17 +437,13 @@ inline void storeFP32(float v, c10::BFloat16* ptr) { } inline FP16Vec16::FP16Vec16(const FP32Vec16& v) { - int i = 0; - for (i = 0; i < FP16Vec16::VEC_ELEM_NUM; ++i) { - reg.val[i] = float_to_fp16(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = float_to_fp16(v.reg.val[i]); }); } inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) { - int i = 0; - for (i = 0; i < FP16Vec8::VEC_ELEM_NUM; ++i) { - reg.val[i] = float_to_fp16(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = float_to_fp16(v.reg.val[i]); }); } inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { @@ -495,17 +451,13 @@ inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { } inline BF16Vec8::BF16Vec8(const FP32Vec8& v) { - int i = 0; - for (i = 0; i < BF16Vec8::VEC_ELEM_NUM; ++i) { - reg.val[i] = float_to_bf16(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = float_to_bf16(v.reg.val[i]); }); } inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { - int i = 0; - for (i = 0; i < BF16Vec16::VEC_ELEM_NUM; ++i) { - reg.val[i] = float_to_bf16(v.reg.val[i]); - } + unroll_loop( + [&v, this](int i) { reg.val[i] = float_to_bf16(v.reg.val[i]); }); } inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 3); } From bbc6c2f1e5bc856a9265dfa2b379ed1d242adc33 Mon Sep 17 00:00:00 2001 From: j20120307 Date: Wed, 19 Nov 2025 03:07:22 -0800 Subject: [PATCH 013/249] [CI/Build] Fix broken build on Apple M1 (#28999) Signed-off-by: Kan Zhu --- csrc/cpu/utils.hpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/csrc/cpu/utils.hpp b/csrc/cpu/utils.hpp index d8399c56f6af8..d3def306b8069 100644 --- a/csrc/cpu/utils.hpp +++ b/csrc/cpu/utils.hpp @@ -6,6 +6,10 @@ #include #include +#if 
defined(__APPLE__) + #include +#endif + #include "cpu_types.hpp" namespace cpu_utils { @@ -21,10 +25,12 @@ struct VecTypeTrait { using vec_t = vec_op::FP32Vec16; }; +#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT) template <> struct VecTypeTrait { using vec_t = vec_op::BF16Vec16; }; +#endif template <> struct VecTypeTrait { @@ -44,9 +50,21 @@ struct Counter { inline int64_t get_l2_size() { static int64_t size = []() { +#if defined(__APPLE__) + // macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname. + int64_t l2_cache_size = 0; + size_t len = sizeof(l2_cache_size); + if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 && + l2_cache_size > 0) { + return l2_cache_size >> 1; // use 50% of L2 cache + } + // Fallback if sysctlbyname fails + return 128LL * 1024 >> 1; // use 50% of 128KB +#else long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE); assert(l2_cache_size != -1); return l2_cache_size >> 1; // use 50% of L2 cache +#endif }(); return size; } From 97cfa99d59375de6d5e4c17dc6aea955ae75b493 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 19 Nov 2025 12:32:04 +0100 Subject: [PATCH 014/249] [Docs] Take env var definition out of folded admonition (#29005) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/configuration/env_vars.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/configuration/env_vars.md b/docs/configuration/env_vars.md index 2c0a898754fa0..f6d548a19d91f 100644 --- a/docs/configuration/env_vars.md +++ b/docs/configuration/env_vars.md @@ -7,8 +7,6 @@ vLLM uses the following environment variables to configure the system: All environment variables used by vLLM are prefixed with `VLLM_`. 
**Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). -??? code - - ```python - --8<-- "vllm/envs.py:env-vars-definition" - ``` +```python +--8<-- "vllm/envs.py:env-vars-definition" +``` From ba558c029ad65ab4f040c8320607ebd87612cf08 Mon Sep 17 00:00:00 2001 From: Tova Movshovitz Date: Wed, 19 Nov 2025 13:37:11 +0200 Subject: [PATCH 015/249] [config] Expose `get_total_num_hidden_layers()` in ModelConfig (#28961) Signed-off-by: tovam Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: Cyrus Leung --- vllm/config/model.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 3e8790a26e0e3..f61dbb6a695a2 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1369,11 +1369,7 @@ class ModelConfig: # Coerce to 0 if explicitly set to None return num_experts or 0 - def get_layers_start_end_indices( - self, parallel_config: ParallelConfig - ) -> tuple[int, int]: - from vllm.distributed.utils import get_pp_indices - + def get_total_num_hidden_layers(self) -> int: if ( self.hf_text_config.model_type == "deepseek_mtp" or self.hf_config.model_type == "mimo_mtp" @@ -1393,6 +1389,15 @@ class ModelConfig: total_num_hidden_layers = getattr( self.hf_text_config, "num_hidden_layers", 0 ) + return total_num_hidden_layers + + def get_layers_start_end_indices( + self, parallel_config: ParallelConfig + ) -> tuple[int, int]: + from vllm.distributed.utils import get_pp_indices + + total_num_hidden_layers = self.get_total_num_hidden_layers() + # the layout order is: DP x PP x TP pp_rank = ( parallel_config.rank // 
parallel_config.tensor_parallel_size From da2f6800e0d6ac768c6f63b95f7c0755407f4263 Mon Sep 17 00:00:00 2001 From: Chen Bruce Date: Wed, 19 Nov 2025 20:46:24 +0800 Subject: [PATCH 016/249] [Feat][Perf] Enable deepep-low-latency with round-robin expert placement. (#28449) Signed-off-by: bruceszchen Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../layers/fused_moe/all2all_utils.py | 11 ++ .../fused_moe/deepep_ll_prepare_finalize.py | 30 +++- .../layers/fused_moe/fused_moe_method_base.py | 9 +- vllm/model_executor/layers/fused_moe/layer.py | 157 +++++++++++++++--- .../fused_moe/unquantized_fused_moe_method.py | 7 +- .../compressed_tensors_moe.py | 14 +- .../model_executor/layers/quantization/fp8.py | 7 +- .../layers/quantization/modelopt.py | 10 +- 8 files changed, 208 insertions(+), 37 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/all2all_utils.py b/vllm/model_executor/layers/fused_moe/all2all_utils.py index 2dd625054339c..86c50f39f0076 100644 --- a/vllm/model_executor/layers/fused_moe/all2all_utils.py +++ b/vllm/model_executor/layers/fused_moe/all2all_utils.py @@ -67,6 +67,7 @@ def maybe_roundup_layer_hidden_size( def maybe_make_prepare_finalize( moe: FusedMoEConfig, quant_config: FusedMoEQuantConfig | None, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, ) -> FusedMoEPrepareAndFinalize | None: if not moe.moe_parallel_config.use_all2all_kernels: return None @@ -134,6 +135,13 @@ def maybe_make_prepare_finalize( elif moe.use_deepep_ll_kernels: assert quant_config is not None + global_to_physical = physical_to_global = local_expert_global_ids = None + if routing_tables is not None: + ( + global_to_physical, + physical_to_global, + local_expert_global_ids, + ) = routing_tables all_to_all_args = dict( max_num_tokens_per_dp_rank=moe.max_num_tokens, token_hidden_size=moe.hidden_dim, @@ -155,6 +163,9 @@ def maybe_make_prepare_finalize( max_tokens_per_rank=moe.max_num_tokens, 
num_dispatchers=all2all_manager.world_size, use_fp8_dispatch=use_fp8_dispatch, + global_to_physical=global_to_physical, + physical_to_global=physical_to_global, + local_expert_global_ids=local_expert_global_ids, ) return prepare_finalize diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py index 06c9df317f7c7..e0db248958b47 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py @@ -85,6 +85,9 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): max_tokens_per_rank: int, num_dispatchers: int, use_fp8_dispatch: bool = False, + global_to_physical: torch.Tensor | None = None, + physical_to_global: torch.Tensor | None = None, + local_expert_global_ids: torch.Tensor | None = None, ): super().__init__() @@ -97,6 +100,17 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): self.handles: list[tuple | None] = [None, None] self.num_dispatchers_ = num_dispatchers + topk_indices_dtype = self.topk_indices_dtype() + + def _maybe_cast(tensor: torch.Tensor | None) -> torch.Tensor | None: + if tensor is None or topk_indices_dtype is None: + return tensor + return tensor.to(dtype=topk_indices_dtype) + + self.global_to_physical = _maybe_cast(global_to_physical) + self.physical_to_global = _maybe_cast(physical_to_global) + self.local_expert_global_ids = _maybe_cast(local_expert_global_ids) + # We don't have enough information to determine if we should dispatch # activation scales in a packed ue8m0 format during object construction # time. This setting is handled by post_init_setup. 
@@ -136,6 +150,16 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): def topk_indices_dtype(self) -> torch.dtype | None: return torch.int64 + def _map_global_to_physical_ids(self, topk_ids: torch.Tensor) -> torch.Tensor: + if self.global_to_physical is None: + return topk_ids + return self.global_to_physical[topk_ids] + + def _map_local_to_global_ids(self, expert_topk_ids: torch.Tensor) -> torch.Tensor: + if self.local_expert_global_ids is None: + return expert_topk_ids + return self.local_expert_global_ids[expert_topk_ids] + def _do_quant( self, x: torch.Tensor | tuple[torch.Tensor, torch.Tensor], @@ -226,9 +250,10 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): a1 = a1 * topk_weights.to(a1.dtype) # Dispatch + dispatch_topk_ids = self._map_global_to_physical_ids(topk_ids) expert_x, expert_num_tokens, handle, _, hook = self.buffer.low_latency_dispatch( a1, - topk_ids, + dispatch_topk_ids, self.max_tokens_per_rank, num_experts, use_fp8=self.use_fp8_dispatch, @@ -313,11 +338,12 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): # weights have already been applied. 
combine_topk_weights = torch.ones_like(topk_weights) + combine_topk_ids = self._map_global_to_physical_ids(topk_ids) # TODO (varun) : Enable zero copy mode dbo_maybe_run_recv_hook() _, _, recv_hook = self.buffer.low_latency_combine( fused_expert_output, - topk_ids, + combine_topk_ids, combine_topk_weights, handle, async_finish=False, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py index 87f8c8d75a9b5..073e90a4e6808 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py @@ -50,10 +50,15 @@ class FusedMoEMethodBase(QuantizeMethodBase): """ return False - def maybe_make_prepare_finalize(self) -> FusedMoEPrepareAndFinalize | None: + def maybe_make_prepare_finalize( + self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + ) -> FusedMoEPrepareAndFinalize | None: from .all2all_utils import maybe_make_prepare_finalize - return maybe_make_prepare_finalize(self.moe, self.moe_quant_config) + return maybe_make_prepare_finalize( + self.moe, self.moe_quant_config, routing_tables + ) def select_gemm_impl( self, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 023132acfed3f..c41995e4a9136 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -5,7 +5,7 @@ from collections.abc import Callable, Iterable from contextlib import nullcontext from enum import Enum from functools import partial -from typing import Literal, get_args, overload +from typing import Literal, cast, get_args, overload import torch import torch.nn.functional as F @@ -192,6 +192,42 @@ def determine_expert_map( return (local_num_experts, expert_map, expert_mask) +def determine_expert_placement_strategy( + expert_placement_strategy: ExpertPlacementStrategy, + moe_parallel_config: 
FusedMoEParallelConfig, + num_expert_group: int | None, + num_redundant_experts: int, + enable_eplb: bool, +) -> ExpertPlacementStrategy: + if expert_placement_strategy == "round_robin": + round_robin_supported = ( + (num_expert_group is not None and num_expert_group > 1) + and num_redundant_experts == 0 + and not enable_eplb + ) + + if not round_robin_supported: + logger.warning( + "Round-robin expert placement is only supported for " + "models with multiple expert groups and no redundant " + "experts. Falling back to linear expert placement." + ) + return "linear" + if ( + moe_parallel_config.use_all2all_kernels + and not moe_parallel_config.use_deepep_ll_kernels + ): + logger.warning( + "Round-robin expert placement currently only supports " + "the DeepEP low-latency backend, but '%s' was configured. " + "Falling back to linear expert placement.", + moe_parallel_config.all2all_backend, + ) + return "linear" + + return expert_placement_strategy + + def get_compressed_expert_map(expert_map: torch.Tensor) -> str: """ Compresses the expert map by removing any -1 entries. @@ -400,6 +436,9 @@ class FusedMoE(CustomOp): self.expert_load_view: torch.Tensor | None = None self.logical_to_physical_map: torch.Tensor | None = None self.logical_replica_count: torch.Tensor | None = None + self.expert_placement_strategy: ExpertPlacementStrategy = ( + vllm_config.parallel_config.expert_placement_strategy + ) # ROCm aiter shared experts fusion self.rocm_aiter_fmoe_enabled = rocm_aiter_ops.is_fused_moe_enabled() @@ -433,38 +472,27 @@ class FusedMoE(CustomOp): "Redundant experts are only supported with EPLB." 
) - expert_placement_strategy = ( - vllm_config.parallel_config.expert_placement_strategy + self.expert_placement_strategy = determine_expert_placement_strategy( + expert_placement_strategy=self.expert_placement_strategy, + moe_parallel_config=self.moe_parallel_config, + num_expert_group=num_expert_group, + num_redundant_experts=num_redundant_experts, + enable_eplb=self.enable_eplb, ) - if expert_placement_strategy == "round_robin": - # TODO(Bruce): will support round robin expert placement with - # EPLB enabled in the future. - round_robin_supported = ( - (num_expert_group is not None and num_expert_group > 1) - and num_redundant_experts == 0 - and not self.enable_eplb - ) - - if not round_robin_supported: - logger.warning( - "Round-robin expert placement is only supported for " - "models with multiple expert groups and no redundant " - "experts. Falling back to linear expert placement." - ) - expert_placement_strategy = "linear" self.expert_map: torch.Tensor | None local_num_experts, expert_map, expert_mask = determine_expert_map( ep_size=self.ep_size, ep_rank=self.ep_rank, global_num_experts=self.global_num_experts, - expert_placement_strategy=expert_placement_strategy, + expert_placement_strategy=self.expert_placement_strategy, num_fused_shared_experts=self.num_fused_shared_experts, return_expert_mask=self.rocm_aiter_fmoe_enabled, ) self.local_num_experts = local_num_experts self.register_buffer("expert_map", expert_map) self.register_buffer("expert_mask", expert_mask) + self._maybe_init_expert_routing_tables() logger.info_once( "[EP Rank %s/%s] Expert parallelism is enabled. Expert " "placement strategy: %s. Local/global" @@ -472,7 +500,7 @@ class FusedMoE(CustomOp): " %s.", self.ep_rank, self.ep_size, - expert_placement_strategy, + self.expert_placement_strategy, self.local_num_experts, self.global_num_experts, get_compressed_expert_map(self.expert_map), @@ -621,7 +649,12 @@ class FusedMoE(CustomOp): # should be safe to swap out the quant_method. 
def maybe_init_modular_kernel(self) -> None: self.ensure_moe_quant_config_init() - prepare_finalize = self.quant_method.maybe_make_prepare_finalize() + # routing_tables only needed for round-robin expert placement with + # DeepEP all2all backend. + routing_tables = self._maybe_init_expert_routing_tables() + prepare_finalize = self.quant_method.maybe_make_prepare_finalize( + routing_tables=routing_tables + ) if prepare_finalize is not None: logger.debug( "%s for %s(%s)", prepare_finalize.__class__.__name__, self, id(self) @@ -703,6 +736,84 @@ class FusedMoE(CustomOp): # By default, router/gate is called before FusedMoE forward pass return False + def _maybe_init_expert_routing_tables( + self, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None: + # Currently routing_tables only needed for round-robin expert placement + # with DeepEP-ll all2all backend. + if ( + self.expert_placement_strategy != "round_robin" + or not self.use_deepep_ll_kernels + ): + return None + + if hasattr(self, "expert_global_to_physical"): + return cast( + tuple[torch.Tensor, torch.Tensor, torch.Tensor], + ( + self.expert_global_to_physical, + self.expert_physical_to_global, + self.expert_local_to_global, + ), + ) + + if self.expert_map is None: + return None + + routing_tables = self.ensure_round_robin_expert_routing_tables( + global_num_experts=self.global_num_experts, + ep_size=self.ep_size, + ep_rank=self.ep_rank, + local_num_experts=self.local_num_experts, + device=self.expert_map.device, + ) + + global_to_physical, physical_to_global, local_global = routing_tables + self.register_buffer("expert_global_to_physical", global_to_physical) + self.register_buffer("expert_physical_to_global", physical_to_global) + self.register_buffer("expert_local_to_global", local_global) + + return routing_tables + + @staticmethod + def ensure_round_robin_expert_routing_tables( + global_num_experts: int, + ep_size: int, + ep_rank: int, + local_num_experts: int, + device: torch.device | None = None, 
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + device_kwargs = {"device": device} if device is not None else {} + global_indices = torch.arange( + global_num_experts, dtype=torch.long, **device_kwargs + ) + owner = torch.remainder(global_indices, ep_size) + local_index = torch.div(global_indices, ep_size, rounding_mode="floor") + base = global_num_experts // ep_size + remainder = global_num_experts % ep_size + physical_offset = owner * base + if remainder > 0: + remainder_tensor = torch.tensor( + remainder, dtype=torch.long, **device_kwargs + ) + physical_offset = physical_offset + torch.minimum(owner, remainder_tensor) + + global_to_physical = physical_offset + local_index + physical_to_global = torch.empty_like(global_to_physical) + physical_to_global[global_to_physical] = global_indices + + local_global = torch.arange( + ep_rank, + global_num_experts, + ep_size, + dtype=torch.long, + **device_kwargs, + ) + if local_global.numel() != local_num_experts: + local_global = local_global[:local_num_experts] + + return (global_to_physical, physical_to_global, local_global) + def update_expert_map(self): # ep_size and ep_rank should already be updated assert self.expert_map is not None @@ -711,12 +822,14 @@ class FusedMoE(CustomOp): ep_size=self.ep_size, ep_rank=self.ep_rank, global_num_experts=self.global_num_experts, + expert_placement_strategy=self.expert_placement_strategy, num_fused_shared_experts=self.num_fused_shared_experts, return_expert_mask=self.rocm_aiter_fmoe_enabled, ) self.local_num_experts = local_num_experts self.register_buffer("expert_map", expert_map) self.register_buffer("expert_mask", expert_mask) + self._maybe_init_expert_routing_tables() if self.aiter_fmoe_shared_expert_enabled: self._init_aiter_shared_experts_topK_buffer( vllm_config=get_current_vllm_config(), diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index 
2e0376553b913..63b0e6f573d65 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -108,11 +108,14 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): def allow_inplace(self) -> bool: return True - def maybe_make_prepare_finalize(self) -> FusedMoEPrepareAndFinalize | None: + def maybe_make_prepare_finalize( + self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + ) -> FusedMoEPrepareAndFinalize | None: if self.rocm_aiter_moe_enabled: return None else: - return super().maybe_make_prepare_finalize() + return super().maybe_make_prepare_finalize(routing_tables) def select_gemm_impl( self, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 06ee96d55419c..22b3c477f420f 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -380,11 +380,14 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): (layer.w2_input_global_scale), requires_grad=False ) - def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: + def maybe_make_prepare_finalize( + self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + ) -> mk.FusedMoEPrepareAndFinalize | None: if self.use_marlin: return None elif not self.allow_flashinfer: - return super().maybe_make_prepare_finalize() + return super().maybe_make_prepare_finalize(routing_tables) prepare_finalize = build_flashinfer_fp4_cutlass_moe_prepare_finalize(self.moe) logger.debug_once("%s", prepare_finalize.__class__.__name__) @@ -890,11 +893,14 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): layer.w2_weight_scale ) - def 
maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: + def maybe_make_prepare_finalize( + self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + ) -> mk.FusedMoEPrepareAndFinalize | None: if self.use_marlin or self.rocm_aiter_moe_enabled: return None else: - return super().maybe_make_prepare_finalize() + return super().maybe_make_prepare_finalize(routing_tables) def select_gemm_impl( self, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 0479bec338408..92fbdd7093483 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1018,7 +1018,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): del layer.w13_input_scale del layer.w2_input_scale - def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: + def maybe_make_prepare_finalize( + self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + ) -> mk.FusedMoEPrepareAndFinalize | None: if ( self.rocm_aiter_moe_enabled or self.use_marlin @@ -1039,7 +1042,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): logger.debug_once("%s", prepare_finalize.__class__.__name__) return prepare_finalize else: - return super().maybe_make_prepare_finalize() + return super().maybe_make_prepare_finalize(routing_tables) def select_gemm_impl( self, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 476521813f464..38ab7cd4f115c 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -373,6 +373,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): def maybe_make_prepare_finalize( self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, ) -> mk.FusedMoEPrepareAndFinalize | None: # TRT LLM not supported with all2all yet. 
if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: @@ -384,7 +385,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): logger.debug_once("%s", prepare_finalize.__class__.__name__) return prepare_finalize else: - return super().maybe_make_prepare_finalize() + return super().maybe_make_prepare_finalize(routing_tables) def select_gemm_impl( self, @@ -1179,7 +1180,10 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): " for ModelOptNvFp4FusedMoE." ) - def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None: + def maybe_make_prepare_finalize( + self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + ) -> mk.FusedMoEPrepareAndFinalize | None: if self.use_marlin or ( self.allow_flashinfer and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM @@ -1196,7 +1200,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): logger.debug_once("%s", prepare_finalize.__class__.__name__) return prepare_finalize else: - return super().maybe_make_prepare_finalize() + return super().maybe_make_prepare_finalize(routing_tables) def select_gemm_impl( self, From 09540cd918a5f7d776d7f7e0abec78fbc03938ad Mon Sep 17 00:00:00 2001 From: Didier Durand <2927957+didier-durand@users.noreply.github.com> Date: Wed, 19 Nov 2025 13:56:21 +0100 Subject: [PATCH 017/249] [Doc]: fix typos in various files (#29010) Signed-off-by: Didier Durand --- docs/deployment/frameworks/skypilot.md | 2 +- docs/design/prefix_caching.md | 2 +- docs/features/nixl_connector_usage.md | 2 +- docs/getting_started/quickstart.md | 2 +- tests/v1/ec_connector/integration/README.md | 2 +- vllm/multimodal/evs.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/deployment/frameworks/skypilot.md b/docs/deployment/frameworks/skypilot.md index f4a984a6433e2..e9b0d5f0671c3 100644 --- a/docs/deployment/frameworks/skypilot.md +++ b/docs/deployment/frameworks/skypilot.md @@ -4,7 +4,7 @@ vLLM

-vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html). +vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc., can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html). ## Prerequisites diff --git a/docs/design/prefix_caching.md b/docs/design/prefix_caching.md index bd4070f381d81..48536a877bd3f 100644 --- a/docs/design/prefix_caching.md +++ b/docs/design/prefix_caching.md @@ -1,6 +1,6 @@ # Automatic Prefix Caching -Prefix caching kv-cache blocks is a popular optimization in LLM inference to avoid redundant prompt computations. The core idea is simple – we cache the kv-cache blocks of processed requests, and reuse these blocks when a new request comes in with the same prefix as previous requests. Since prefix caching is almost a free lunch and won’t change model outputs, it has been widely used by many public endpoints (e.g., OpenAI, Anthropic, etc) and most open source LLM inference frameworks (e.g., SGLang). +Prefix caching kv-cache blocks is a popular optimization in LLM inference to avoid redundant prompt computations. The core idea is simple – we cache the kv-cache blocks of processed requests, and reuse these blocks when a new request comes in with the same prefix as previous requests. Since prefix caching is almost a free lunch and won’t change model outputs, it has been widely used by many public endpoints (e.g., OpenAI, Anthropic, etc.) 
and most open source LLM inference frameworks (e.g., SGLang). While there are many ways to implement prefix caching, vLLM chooses a hash-based approach. Specifically, we hash each kv-cache block by the tokens in the block and the tokens in the prefix before the block: diff --git a/docs/features/nixl_connector_usage.md b/docs/features/nixl_connector_usage.md index 1ce038f4d6525..f0e25e31aa0b3 100644 --- a/docs/features/nixl_connector_usage.md +++ b/docs/features/nixl_connector_usage.md @@ -158,7 +158,7 @@ python tests/v1/kv_connector/nixl_integration/toy_proxy_server.py \ ## Experimental Feature -### Heterogenuous KV Layout support +### Heterogeneous KV Layout support Support use case: Prefill with 'HND' and decode with 'NHD' with experimental configuration diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index cfc8b4d9838a7..9e86f785b10c7 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -286,7 +286,7 @@ If desired, you can also manually set the backend of your choice by configuring - On NVIDIA CUDA: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`. - On AMD ROCm: `TRITON_ATTN`, `ROCM_ATTN`, `ROCM_AITER_FA` or `ROCM_AITER_UNIFIED_ATTN`. 
-For AMD ROCm, you can futher control the specific Attention implementation using the following variables: +For AMD ROCm, you can further control the specific Attention implementation using the following variables: - Triton Unified Attention: `VLLM_ROCM_USE_AITER=0 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=0` - AITER Unified Attention: `VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=0` diff --git a/tests/v1/ec_connector/integration/README.md b/tests/v1/ec_connector/integration/README.md index 30426e055ade8..2dbcb307fda32 100644 --- a/tests/v1/ec_connector/integration/README.md +++ b/tests/v1/ec_connector/integration/README.md @@ -113,7 +113,7 @@ Quick sanity check: - Outputs differ between baseline and disagg - Server startup fails -- Encoder cache not found (should fallback to local execution) +- Encoder cache not found (should fall back to local execution) - Proxy routing errors ## Notes diff --git a/vllm/multimodal/evs.py b/vllm/multimodal/evs.py index 4a288d2d238c2..8a36ea415da4d 100644 --- a/vllm/multimodal/evs.py +++ b/vllm/multimodal/evs.py @@ -185,7 +185,7 @@ def recompute_mrope_positions( Args: input_ids: (N,) All input tokens of the prompt (entire sequence). - multimodal_positions: List of mrope positsions for each media. + multimodal_positions: List of mrope positions for each media. mrope_positions: Existing mrope positions (4, N) for entire sequence. num_computed_tokens: A number of computed tokens so far. vision_start_token_id: Token indicating start of vision media. 
From 4f5299f7174ffb10bdc640b47d3494083fc39c48 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 19 Nov 2025 14:50:30 +0100 Subject: [PATCH 018/249] Relax Transformers modeling backend MoE experts check (#28952) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/models/supported_models.md | 4 +++- vllm/model_executor/models/transformers/moe.py | 9 ++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index bd14bbb9ab662..80fe143269a76 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -79,7 +79,9 @@ To make your model compatible with the Transformers modeling backend, it needs: 1. Add `is_causal = False` to `MyAttention`. - If your model is mixture-of-experts (MoE): 1. Your sparse MoE block must have an attribute called `experts`. - 2. The class of `experts` (`MyExperts`) must inherit from `nn.ModuleList`. + 2. The class of `experts` (`MyExperts`) must either: + - Inherit from `nn.ModuleList` (naive). + - Or contain all 3D `nn.Parameters` (packed). 3. `MyExperts.forward` must accept `hidden_states`, `top_k_index`, `top_k_weights`. 2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention. 3. `MyModel` must contain `_supports_attention_backend = True`. 
diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index 4973014c3d4ed..31db9d682bd40 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -256,7 +256,14 @@ class MoEMixin(MixtureOfExperts): def _recursive_replace(module: nn.Module, prefix: str): for child_name, child_module in module.named_children(): qual_name = maybe_prefix(prefix, child_name) - if child_name == "experts" and isinstance(child_module, nn.ModuleList): + # Naive implementations will have experts as ModuleList + is_modulelist = isinstance(child_module, nn.ModuleList) + # Packed implementations will have experts as 3D tensors of shapes like: + # gate_up_proj = (num_experts, 2 * intermediate_size, hidden_size) + # down_proj = (num_experts, intermediate_size, hidden_size) + params = list(child_module.parameters()) + is_3d = len(params) > 0 and all(p.ndim == 3 for p in params) + if child_name == "experts" and (is_modulelist or is_3d): # Alias for readability mlp = module experts = child_module From 2c8b9182b5ced00d83bed15ef8bc0ac6e079b6ee Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Wed, 19 Nov 2025 06:13:50 -0800 Subject: [PATCH 019/249] [CI] Reorganize compile tests so new tests are automatically included in CI (#28625) Signed-off-by: Yanan Cao --- .buildkite/test-amd.yaml | 57 ++++++++--------- .buildkite/test-pipeline.yaml | 62 +++++++++---------- tests/compile/README.md | 5 ++ .../{piecewise => distributed}/__init__.py | 0 .../{ => distributed}/test_async_tp.py | 6 +- .../test_fusion_all_reduce.py | 4 +- .../{ => distributed}/test_fusions_e2e.py | 2 +- .../test_sequence_parallelism.py | 4 +- tests/compile/fullgraph/__init__.py | 0 .../{ => fullgraph}/test_basic_correctness.py | 2 +- .../test_full_cudagraph.py | 0 .../{ => fullgraph}/test_full_graph.py | 2 +- .../test_multimodal_compile.py | 0 .../test_multiple_graphs.py | 0 .../{piecewise => fullgraph}/test_simple.py | 0 
.../test_toy_llama.py | 0 vllm/env_override.py | 2 +- 17 files changed, 74 insertions(+), 72 deletions(-) create mode 100644 tests/compile/README.md rename tests/compile/{piecewise => distributed}/__init__.py (100%) rename tests/compile/{ => distributed}/test_async_tp.py (99%) rename tests/compile/{ => distributed}/test_fusion_all_reduce.py (99%) rename tests/compile/{ => distributed}/test_fusions_e2e.py (99%) rename tests/compile/{ => distributed}/test_sequence_parallelism.py (99%) create mode 100644 tests/compile/fullgraph/__init__.py rename tests/compile/{ => fullgraph}/test_basic_correctness.py (99%) rename tests/compile/{piecewise => fullgraph}/test_full_cudagraph.py (100%) rename tests/compile/{ => fullgraph}/test_full_graph.py (99%) rename tests/compile/{ => fullgraph}/test_multimodal_compile.py (100%) rename tests/compile/{piecewise => fullgraph}/test_multiple_graphs.py (100%) rename tests/compile/{piecewise => fullgraph}/test_simple.py (100%) rename tests/compile/{piecewise => fullgraph}/test_toy_llama.py (100%) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 2471b509a9fff..0049f35403409 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -187,7 +187,7 @@ steps: - tests/distributed/test_utils - tests/distributed/test_pynccl - tests/distributed/test_events - - tests/compile/test_basic_correctness + - tests/compile/fullgraph/test_basic_correctness.py - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py @@ -215,7 +215,7 @@ steps: - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py - pytest -v -s 
distributed/test_symm_mem_allreduce.py @@ -493,17 +493,12 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_pass_manager.py - - pytest -v -s compile/test_fusion.py - - pytest -v -s compile/test_fusion_attn.py - - pytest -v -s compile/test_functionalization.py - - pytest -v -s compile/test_silu_mul_quant_fusion.py - # - pytest -v -s compile/test_sequence_parallelism.py - # - pytest -v -s compile/test_async_tp.py - - pytest -v -s compile/test_fusion_all_reduce.py - - pytest -v -s compile/test_decorator.py - - pytest -v -s compile/test_noop_elimination.py - - pytest -v -s compile/test_aot_compile.py + # Run unit tests defined directly under compile/, + # not including subdirectories, which are usually heavier + # tests covered elsewhere. + # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -515,9 +510,11 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_basic_correctness.py - - pytest -v -s compile/test_multimodal_compile.py - - pytest -v -s compile/piecewise/ + # Run smoke tests under fullgraph directory, except test_full_graph.py + # as it is a heavy test that is covered in other steps. 
+ # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Test # 27min timeout_in_minutes: 40 @@ -529,10 +526,10 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' # Limit to no custom ops to reduce running time # Wrap with quotes to escape yaml and avoid starting -k string with a - - - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'" + - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and -quant_fp8'" - label: Cudagraph test timeout_in_minutes: 20 @@ -1066,10 +1063,10 @@ steps: - pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py # this runner has 2 GPUs available even though num_gpus=2 is not set - - pytest -v -s tests/compile/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Wrap with quotes to escape yaml - - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'" + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'" - label: Blackwell Fusion E2E Tests # 30 min timeout_in_minutes: 40 @@ -1086,14 +1083,14 @@ steps: - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/test_fusions_e2e.py - - tests/compile/test_full_graph.py + - 
tests/compile/distributed/test_fusions_e2e.py + - tests/compile/fullgraph/test_full_graph.py commands: - nvidia-smi # Run all e2e fusion tests - - pytest -v -s tests/compile/test_fusions_e2e.py + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - label: ROCm GPT-OSS Eval timeout_in_minutes: 60 @@ -1198,7 +1195,7 @@ steps: - vllm/worker/worker_base.py - vllm/v1/engine/ - vllm/v1/worker/ - - tests/compile/test_basic_correctness.py + - tests/compile/fullgraph/test_basic_correctness.py - tests/compile/test_wrapper.py - tests/distributed/ - tests/entrypoints/llm/test_collective_rpc.py @@ -1211,7 +1208,7 @@ steps: - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - - pytest -v -s ./compile/test_basic_correctness.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' @@ -1417,10 +1414,10 @@ steps: working_dir: "/vllm-workspace/" num_gpus: 2 commands: - - pytest -v -s tests/compile/test_async_tp.py - - pytest -v -s tests/compile/test_sequence_parallelism.py - - pytest -v -s tests/compile/test_fusion_all_reduce.py - - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + - pytest -v -s tests/compile/distributed/test_async_tp.py + - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s 
tests/compile/distributed/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - pytest -v -s tests/v1/distributed/test_dbo.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 4ac76aba67b9c..e62cd60efaec0 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -167,7 +167,7 @@ steps: - tests/distributed/test_utils - tests/distributed/test_pynccl - tests/distributed/test_events - - tests/compile/test_basic_correctness + - tests/compile/fullgraph/test_basic_correctness.py - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py @@ -197,7 +197,7 @@ steps: - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py - pytest -v -s distributed/test_symm_mem_allreduce.py @@ -445,18 +445,12 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_graph_partition.py - - pytest -v -s compile/test_config.py - - pytest -v -s compile/test_pass_manager.py - - pytest -v -s compile/test_fusion.py - - pytest -v -s compile/test_fusion_attn.py - - pytest -v -s compile/test_functionalization.py - - pytest -v -s compile/test_silu_mul_quant_fusion.py - - pytest -v -s compile/test_fusion_all_reduce.py - - pytest -v -s compile/test_decorator.py - - 
pytest -v -s compile/test_noop_elimination.py - - pytest -v -s compile/test_aot_compile.py - - pytest -v -s compile/test_qk_norm_rope_fusion.py + # Run unit tests defined directly under compile/, + # not including subdirectories, which are usually heavier + # tests covered elsewhere. + # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -466,9 +460,11 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/test_basic_correctness.py - - pytest -v -s compile/test_multimodal_compile.py - - pytest -v -s compile/piecewise/ + # Run smoke tests under fullgraph directory, except test_full_graph.py + # as it is a heavy test that is covered in other steps. + # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Test # 27min timeout_in_minutes: 40 @@ -479,10 +475,10 @@ steps: - tests/compile commands: # fp8 kv scales not supported on sm89, tested on Blackwell instead - - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' # Limit to no custom ops to reduce running time # Wrap with quotes to escape yaml and avoid starting -k string with a - - - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" + - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - label: Cudagraph test timeout_in_minutes: 20 @@ -939,17 +935,22 @@ steps: - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - 
vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/test_fusion_attn.py + - tests/compile/test_silu_mul_quant_fusion.py + - tests/compile/distributed/test_fusion_all_reduce.py + - tests/compile/distributed/test_fusions_e2e.py + - tests/compile/fullgraph/test_full_graph.py commands: - nvidia-smi - pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py # this runner has 2 GPUs available even though num_gpus=2 is not set - - pytest -v -s tests/compile/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Wrap with quotes to escape yaml - - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - label: Blackwell Fusion E2E Tests # 30 min timeout_in_minutes: 40 @@ -966,12 +967,11 @@ steps: - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/test_fusions_e2e.py - - tests/compile/test_full_graph.py + - tests/compile/distributed/test_fusions_e2e.py commands: - nvidia-smi # Run all e2e fusion tests - - pytest -v -s tests/compile/test_fusions_e2e.py + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py - label: Blackwell GPT-OSS Eval timeout_in_minutes: 60 @@ -1069,7 +1069,7 @@ steps: - vllm/worker/worker_base.py - vllm/v1/engine/ - vllm/v1/worker/ - - 
tests/compile/test_basic_correctness.py + - tests/compile/fullgraph/test_basic_correctness.py - tests/compile/test_wrapper.py - tests/distributed/ - tests/entrypoints/llm/test_collective_rpc.py @@ -1084,7 +1084,7 @@ steps: - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - - pytest -v -s ./compile/test_basic_correctness.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' @@ -1264,10 +1264,10 @@ steps: working_dir: "/vllm-workspace/" num_gpus: 2 commands: - - pytest -v -s tests/compile/test_async_tp.py - - pytest -v -s tests/compile/test_sequence_parallelism.py - - pytest -v -s tests/compile/test_fusion_all_reduce.py - - "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'" + - pytest -v -s tests/compile/distributed/test_async_tp.py + - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 diff --git a/tests/compile/README.md b/tests/compile/README.md new file mode 100644 index 0000000000000..300a956860005 --- /dev/null +++ b/tests/compile/README.md @@ -0,0 +1,5 @@ +# compile 
test folder structure + +- `compile/test_*.py` : various unit tests meant for testing particular code path/features. Future tests are most likely added here. New test files added here will be included in CI automatically +- `compile/fullgraph/` : full model tests, including all tests previously in compile/piecewise. These tests do not target particular features. New test files added here will be included in CI automatically +- `compile/distributed/` : tests that require multiple GPUs. New test files added here will **NOT** be included in CI automatically as these tests generally need to be manually configured to run in runners with particular number/type of GPUs. diff --git a/tests/compile/piecewise/__init__.py b/tests/compile/distributed/__init__.py similarity index 100% rename from tests/compile/piecewise/__init__.py rename to tests/compile/distributed/__init__.py diff --git a/tests/compile/test_async_tp.py b/tests/compile/distributed/test_async_tp.py similarity index 99% rename from tests/compile/test_async_tp.py rename to tests/compile/distributed/test_async_tp.py index 71ee228781438..86d409f1eadb0 100644 --- a/tests/compile/test_async_tp.py +++ b/tests/compile/distributed/test_async_tp.py @@ -27,13 +27,13 @@ from vllm.distributed.parallel_state import ( from vllm.platforms import current_platform from vllm.utils.system_utils import update_environment_variables -from ..models.registry import HF_EXAMPLE_MODELS -from ..utils import ( +from ...models.registry import HF_EXAMPLE_MODELS +from ...utils import ( compare_two_settings, create_new_process_for_each_test, multi_gpu_test, ) -from .backend import TestBackend +from ..backend import TestBackend FP8_DTYPE = current_platform.fp8_dtype() diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/distributed/test_fusion_all_reduce.py similarity index 99% rename from tests/compile/test_fusion_all_reduce.py rename to tests/compile/distributed/test_fusion_all_reduce.py index 6d0a0ed7d89d2..d401d57032752 
100644 --- a/tests/compile/test_fusion_all_reduce.py +++ b/tests/compile/distributed/test_fusion_all_reduce.py @@ -33,8 +33,8 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( from vllm.platforms import current_platform from vllm.utils.system_utils import update_environment_variables -from ..utils import has_module_attribute, multi_gpu_test -from .backend import TestBackend +from ...utils import has_module_attribute, multi_gpu_test +from ..backend import TestBackend class TestAllReduceRMSNormModel(torch.nn.Module): diff --git a/tests/compile/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py similarity index 99% rename from tests/compile/test_fusions_e2e.py rename to tests/compile/distributed/test_fusions_e2e.py index f22d60ef000b2..2e1b595a43895 100644 --- a/tests/compile/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -18,7 +18,7 @@ from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer from vllm.utils.torch_utils import is_torch_equal_or_newer -from ..utils import flat_product, multi_gpu_test +from ...utils import flat_product, multi_gpu_test is_blackwell = lambda: current_platform.is_device_capability(100) """Are we running on Blackwell, a lot of tests depend on it""" diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/distributed/test_sequence_parallelism.py similarity index 99% rename from tests/compile/test_sequence_parallelism.py rename to tests/compile/distributed/test_sequence_parallelism.py index 9cd7f64b04af5..30084dfd5a950 100644 --- a/tests/compile/test_sequence_parallelism.py +++ b/tests/compile/distributed/test_sequence_parallelism.py @@ -32,8 +32,8 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp from vllm.platforms import current_platform from vllm.utils.system_utils import update_environment_variables -from ..utils import multi_gpu_test -from .backend import TestBackend +from ...utils 
import multi_gpu_test +from ..backend import TestBackend FP8_DTYPE = current_platform.fp8_dtype() prompts = [ diff --git a/tests/compile/fullgraph/__init__.py b/tests/compile/fullgraph/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/fullgraph/test_basic_correctness.py similarity index 99% rename from tests/compile/test_basic_correctness.py rename to tests/compile/fullgraph/test_basic_correctness.py index 3f6898607f6b9..965938c4433dd 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/fullgraph/test_basic_correctness.py @@ -7,7 +7,7 @@ import pytest from vllm.config import CompilationMode from vllm.utils.torch_utils import cuda_device_count_stateless -from ..utils import compare_all_settings +from ...utils import compare_all_settings @dataclasses.dataclass diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/fullgraph/test_full_cudagraph.py similarity index 100% rename from tests/compile/piecewise/test_full_cudagraph.py rename to tests/compile/fullgraph/test_full_cudagraph.py diff --git a/tests/compile/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py similarity index 99% rename from tests/compile/test_full_graph.py rename to tests/compile/fullgraph/test_full_graph.py index b4e5e56ac9fe6..2c11ecef7f029 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -15,7 +15,7 @@ from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassC from vllm.platforms import current_platform from vllm.utils.torch_utils import is_torch_equal_or_newer -from ..utils import create_new_process_for_each_test +from ...utils import create_new_process_for_each_test def models_list(*, all: bool = True, keywords: list[str] | None = None): diff --git a/tests/compile/test_multimodal_compile.py b/tests/compile/fullgraph/test_multimodal_compile.py similarity index 100% rename from 
tests/compile/test_multimodal_compile.py rename to tests/compile/fullgraph/test_multimodal_compile.py diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/fullgraph/test_multiple_graphs.py similarity index 100% rename from tests/compile/piecewise/test_multiple_graphs.py rename to tests/compile/fullgraph/test_multiple_graphs.py diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/fullgraph/test_simple.py similarity index 100% rename from tests/compile/piecewise/test_simple.py rename to tests/compile/fullgraph/test_simple.py diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/fullgraph/test_toy_llama.py similarity index 100% rename from tests/compile/piecewise/test_toy_llama.py rename to tests/compile/fullgraph/test_toy_llama.py diff --git a/vllm/env_override.py b/vllm/env_override.py index 14dae2850c354..9ae1af3af46cf 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -95,7 +95,7 @@ def memory_plan_reuse_patched(self): # =================================================== # This change monkeypatches get_graph_partition_signature in pytorch 2.9.0 to # fix inductor partition + attention-nvfp4 quant fusion, tested in -# `tests/compile/test_fusions_e2e.py::test_attn_quant`. +# `tests/compile/distributed/test_fusions_e2e.py::test_attn_quant`. # For more context, see https://github.com/pytorch/pytorch/pull/165815. 
From 1ffe934c8ae978e5ed82559a1eaeca05e37f9b35 Mon Sep 17 00:00:00 2001 From: vnadathur Date: Wed, 19 Nov 2025 06:13:54 -0800 Subject: [PATCH 020/249] [torch.compile] caching of config fields should be opt-out by default (#26468) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: vnadathur Signed-off-by: WorldExplored Signed-off-by: Srreyansh Sethi Signed-off-by: Srreyansh Sethi <107075589+WorldExplored@users.noreply.github.com> Co-authored-by: WorldExplored Co-authored-by: Srreyansh Sethi <107075589+worldexplored@users.noreply.github.com> Co-authored-by: vnadathur <236933696+vnadathur@users.noreply.github.com> Co-authored-by: Luka Govedič --- tests/config/test_config_utils.py | 166 +++++++++++++++++++++++++++++ vllm/compilation/backends.py | 105 +++++++++++++++---- vllm/compilation/pass_manager.py | 2 +- vllm/config/cache.py | 31 ++++-- vllm/config/compilation.py | 40 +++---- vllm/config/model.py | 88 ++++++++-------- vllm/config/parallel.py | 49 ++++++--- vllm/config/utils.py | 119 ++++++++++++++++++++- vllm/envs.py | 167 +++++++++++++++--------------- vllm/logging_utils/__init__.py | 2 + vllm/logging_utils/lazy.py | 20 ++++ 11 files changed, 599 insertions(+), 190 deletions(-) create mode 100644 tests/config/test_config_utils.py create mode 100644 vllm/logging_utils/lazy.py diff --git a/tests/config/test_config_utils.py b/tests/config/test_config_utils.py new file mode 100644 index 0000000000000..1277c7e64eb21 --- /dev/null +++ b/tests/config/test_config_utils.py @@ -0,0 +1,166 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass +from enum import Enum + +import pytest + +from vllm.config.utils import get_hash_factors, hash_factors, normalize_value + +# Helpers + + +def endswith_fqname(obj, suffix: str) -> bool: + # normalize_value(type) returns fully-qualified name + # Compare suffix to avoid brittle import paths. 
+ out = normalize_value(obj) + return isinstance(out, str) and out.endswith(suffix) + + +def expected_path(p_str: str = ".") -> str: + import pathlib + + p = pathlib.Path(p_str) + return p.expanduser().resolve().as_posix() + + +# Minimal dataclass to test get_hash_factors. +# Avoid importing heavy vLLM configs. +@dataclass +class SimpleConfig: + a: object + b: object | None = None + + +class DummyLogprobsMode(Enum): + RAW_LOGITS = "raw_logits" + + +def test_hash_factors_deterministic(): + """Test that hash_factors produces consistent SHA-256 hashes""" + factors = {"a": 1, "b": "test"} + hash1 = hash_factors(factors) + hash2 = hash_factors(factors) + + assert hash1 == hash2 + # Dict key insertion order should not affect the hash. + factors_reordered = {"b": "test", "a": 1} + assert hash_factors(factors_reordered) == hash1 + assert len(hash1) == 64 + assert all(c in "0123456789abcdef" for c in hash1) + + +@pytest.mark.parametrize( + "inp, expected", + [ + (None, None), + (True, True), + (1, 1), + (1.0, 1.0), + ("x", "x"), + (b"ab", "6162"), + (bytearray(b"ab"), "6162"), + ([1, 2], (1, 2)), + ({"b": 2, "a": 1}, (("a", 1), ("b", 2))), + ], +) +def test_normalize_value_matrix(inp, expected): + """Parametric input→expected normalization table.""" + assert normalize_value(inp) == expected + + +def test_normalize_value_enum(): + # Enums normalize to (module.QualName, value). + # DummyLogprobsMode uses a string payload. + out = normalize_value(DummyLogprobsMode.RAW_LOGITS) + assert isinstance(out, tuple) + assert out[0].endswith("DummyLogprobsMode") + # Expect string payload 'raw_logits'. + assert out[1] == "raw_logits" + + +def test_normalize_value_set_order_insensitive(): + # Sets are unordered; normalize_value sorts elements for determinism. 
+ assert normalize_value({3, 1, 2}) == normalize_value({1, 2, 3}) + + +def test_normalize_value_path_normalization(): + from pathlib import Path # local import to avoid global dependency + + # Paths expand/resolve to absolute strings. + # Stabilizes hashing across working dirs. + assert normalize_value(Path(".")) == expected_path(".") + + +def test_normalize_value_uuid_and_to_json(): + # Objects may normalize via uuid() or to_json_string(). + class HasUUID: + def uuid(self): + return "test-uuid" + + class ToJson: + def to_json_string(self): + return '{"x":1}' + + assert normalize_value(HasUUID()) == "test-uuid" + assert normalize_value(ToJson()) == '{"x":1}' + + +@pytest.mark.parametrize( + "bad", + [ + (lambda x: x), + (type("CallableInstance", (), {"__call__": lambda self: 0}))(), + (lambda: (lambda: 0))(), # nested function instance + ], +) +def test_error_cases(bad): + """Inputs expected to raise TypeError.""" + # Reject functions/lambdas/callable instances + # to avoid under-hashing. + with pytest.raises(TypeError): + normalize_value(bad) + + +def test_enum_vs_int_disambiguation(): + # int stays primitive + nf_int = normalize_value(1) + assert nf_int == 1 + + # enum becomes ("module.QualName", value) + nf_enum = normalize_value(DummyLogprobsMode.RAW_LOGITS) + assert isinstance(nf_enum, tuple) and len(nf_enum) == 2 + enum_type, enum_val = nf_enum + assert enum_type.endswith(".DummyLogprobsMode") + assert enum_val == "raw_logits" + + # Build factor dicts from configs with int vs enum + f_int = get_hash_factors(SimpleConfig(1), set()) + f_enum = get_hash_factors(SimpleConfig(DummyLogprobsMode.RAW_LOGITS), set()) + # The int case remains a primitive value + assert f_int["a"] == 1 + # The enum case becomes a tagged tuple ("module.QualName", "raw_logits") + assert isinstance(f_enum["a"], tuple) and f_enum["a"][1] == "raw_logits" + # Factor dicts must differ so we don't collide primitives with Enums. 
+ assert f_int != f_enum + # Hash digests must differ correspondingly + assert hash_factors(f_int) != hash_factors(f_enum) + + # Hash functions produce stable hex strings + h_int = hash_factors(f_int) + h_enum = hash_factors(f_enum) + assert isinstance(h_int, str) and len(h_int) == 64 + assert isinstance(h_enum, str) and len(h_enum) == 64 + + +def test_classes_are_types(): + """Types normalize to FQNs; include real vLLM types.""" + # Only classes allowed; functions/lambdas are rejected. + # Canonical form is the fully-qualified name. + assert isinstance(normalize_value(str), str) + + class LocalDummy: + pass + + assert endswith_fqname(LocalDummy, ".LocalDummy") diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 60ef6eef21663..1e66f21ff6388 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -4,12 +4,14 @@ import ast import dataclasses import hashlib +import json import operator import os import pprint import time from collections.abc import Callable, Sequence from contextlib import contextmanager +from functools import partial from typing import Any import torch @@ -23,7 +25,9 @@ from vllm.compilation.partition_rules import ( should_split, ) from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig +from vllm.config.utils import hash_factors from vllm.logger import init_logger +from vllm.logging_utils import lazy from vllm.platforms import current_platform from vllm.utils.import_utils import resolve_obj_by_qualname from vllm.utils.torch_utils import is_torch_equal_or_newer @@ -580,35 +584,47 @@ class VllmBackend: def __call__( self, graph: fx.GraphModule, example_inputs ) -> VllmSerializableFunction: - from .caching import _compute_code_hash, compilation_config_hash_factors - vllm_config = self.vllm_config + # Minimal hashing here with existing utilities, reused below. 
+ + env_factors = envs.compile_factors() + env_hash = hash_factors(env_factors) + # Compute config/compiler/code hashes once and reuse + config_hash = vllm_config.compute_hash() + compiler_hash = self.compiler_manager.compute_hash(vllm_config) + forward_code_files = list(sorted(self.compilation_config.traced_files)) + + logger.debug( + "Traced files (to be considered for compilation cache):\n%s", + lazy(lambda: "\n".join(forward_code_files)), + ) + hash_content = [] + for filepath in forward_code_files: + hash_content.append(filepath) + if filepath == "": + # This means the function was dynamically generated, with + # e.g. exec(). We can't actually check these. + continue + try: + with open(filepath) as f: + hash_content.append(f.read()) + except Exception: + logger.warning("Failed to read file %s", filepath) + continue + code_hash = hashlib.sha256("\n".join(hash_content).encode()).hexdigest() + # Clear after consumption + self.compilation_config.traced_files.clear() if not self.compilation_config.cache_dir: # no provided cache dir, generate one based on the known factors # that affects the compilation. if none of the factors change, # the cache dir will be the same so that we can reuse the compiled # graph. - - factors = compilation_config_hash_factors(vllm_config) - # 2. factors come from the code files that are traced by Dynamo ( - # it mainly summarizes how the model is used in forward pass) - code_hash = _compute_code_hash(self.compilation_config.traced_files) - self.compilation_config.traced_files.clear() - factors.append(code_hash) - - # 3. compiler hash - compiler_hash = self.compiler_manager.compute_hash(vllm_config) - factors.append(compiler_hash) - - # combine all factors to generate the cache dir - hash_key = hashlib.md5( - str(factors).encode(), usedforsecurity=False - ).hexdigest()[:10] - + factors = [env_hash, config_hash, code_hash, compiler_hash] + # Use SHA-256 for cache key hashing to be consistent across + # compute_hash functions. 
Truncate for a short cache dir name. + hash_key = hashlib.sha256(str(factors).encode()).hexdigest()[:10] cache_dir = os.path.join( - envs.VLLM_CACHE_ROOT, - "torch_compile_cache", - hash_key, + envs.VLLM_CACHE_ROOT, "torch_compile_cache", hash_key ) self.compilation_config.cache_dir = cache_dir @@ -621,6 +637,7 @@ class VllmBackend: os.makedirs(local_cache_dir, exist_ok=True) self.compilation_config.local_cache_dir = local_cache_dir + # Honors opt-outs such as CompilationMode.NONE or VLLM_DISABLE_COMPILE_CACHE. disable_cache = not is_compile_cache_enabled( self.compilation_config.inductor_compile_config ) @@ -638,6 +655,50 @@ class VllmBackend: local_cache_dir, disable_cache, self.prefix ) + # Reuses existing cache key + + logger.debug( + "torch.compile cache factors: env=%s cfg=%s comp=%s code=%s dir=%s", + env_hash, + config_hash, + compiler_hash, + code_hash, + local_cache_dir, + ) + + # Persist and log only hash-relevant factors together. + try: + logger.debug( + "Compile env factors (raw):\n%s\nVllm config hash: %s", + lazy(partial(pprint.pformat, env_factors, width=120)), + config_hash, + ) + meta_path = os.path.join(local_cache_dir, "cache_key_factors.json") + if not os.path.exists(meta_path): + with open(meta_path, "w") as f: + json.dump( + { + "env": env_factors, # raw factors used for env_hash + "config_hash": config_hash, + "code_hash": code_hash, + "compiler_hash": compiler_hash, + }, + f, + indent=2, + sort_keys=True, + ) + except Exception: + # Best-effort only; metadata write failures are non-fatal. + logger.warning( + ( + "Could not write compile cache metadata at %s; continuing without " + "metadata. Compiled cache remains valid; diagnostics may be " + "limited." 
+ ), + local_cache_dir, + exc_info=True, + ) + # when dynamo calls the backend, it means the bytecode # transform and analysis are done compilation_counter.num_graphs_seen += 1 diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 0e8bb2fc97351..fe2547d7fecaf 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -127,7 +127,7 @@ class PostGradPassManager(CustomGraphPass): affects compilation caching. Its uuid depends on the UUIDs of all dependent passes and the pass config. See InductorPass for more info. """ - state = {"pass_config": self.pass_config.uuid(), "passes": []} + state = {"pass_config": self.pass_config.compute_hash(), "passes": []} for pass_ in self.passes: state["passes"].append(pass_.uuid()) state["passes"].append(self.fix_functionalization.uuid()) diff --git a/vllm/config/cache.py b/vllm/config/cache.py index 864cf1be81b20..2652c7c06ad0f 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import hashlib from dataclasses import field from typing import TYPE_CHECKING, Any, Literal @@ -160,13 +159,29 @@ class CacheConfig: excluding anything before input ids/embeddings and after the final hidden states. """ - factors: list[Any] = [] - factors.append(self.cache_dtype) - factors.append(self.mamba_cache_dtype) - factors.append(self.mamba_ssm_cache_dtype) - # `cpu_offload_gb` does not use `torch.compile` yet. - hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() - return hash_str + ignored_factors = { + # Runtime/derived knobs that don't affect compiled graph shape + "gpu_memory_utilization", + "swap_space", + "is_attention_free", + "num_gpu_blocks_override", + "enable_prefix_caching", + "prefix_caching_hash_algo", + # `cpu_offload_gb` does not use `torch.compile` yet. 
+ "cpu_offload_gb", + "cpu_kvcache_space_bytes", + "mamba_page_size_padded", + # Post-init/derived counters + "num_gpu_blocks", + "num_cpu_blocks", + # WIP feature toggle not impacting compiled graph shape + "kv_sharing_fast_prefill", + } + + from vllm.config.utils import get_hash_factors, hash_factors + + factors = get_hash_factors(self, ignored_factors) + return hash_factors(factors) def metrics_info(self): # convert cache_config to dict(key: str, value: str) for prometheus diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 088d0b1af757a..ca01cb3fb55d5 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum -import hashlib from collections import Counter from collections.abc import Callable from dataclasses import asdict, field @@ -160,7 +159,7 @@ class PassConfig: current_platform.get_device_capability().to_int(), {} ) - def uuid(self): + def compute_hash(self) -> str: """ Produces a hash unique to the pass configuration. Any new fields that affect compilation should be added to the hash. @@ -506,28 +505,33 @@ class CompilationConfig: def compute_hash(self) -> str: """ - WARNING: Whenever a new field is added to this config, - ensure that it is included in the factors list if - it affects the computation graph. - Provide a hash that uniquely identifies all the configs that affect the structure of the computation graph from input ids/embeddings to the final hidden states, excluding anything before input ids/embeddings and after the final hidden states. 
""" - factors: list[Any] = [] - factors.append(self.mode) - factors.append(self.backend) - factors.append(self.custom_ops) - factors.append(self.splitting_ops) - factors.append(self.use_inductor) - factors.append(self.use_inductor_graph_partition) - factors.append(self.inductor_compile_config) - factors.append(self.inductor_passes) - factors.append(self.pass_config.uuid()) - factors.append(self.compile_cache_save_format) - return hashlib.sha256(str(factors).encode()).hexdigest() + # Opt-out: default-include declared fields; keep a tiny exclude set; + # normalize types; keep SHA-256. For nested opaque configs, include a + # stable identifier (e.g., pass_config.compute_hash()) instead of object id. + + ignored_factors = { + # Paths/dirs and runtime/metrics that don’t affect compiled graph + "debug_dump_path", + "cache_dir", + "local_cache_dir", + "bs_to_padded_graph_size", + "traced_files", + "compilation_time", + "static_forward_context", + "pass_config", # handled separately below + } + + from vllm.config.utils import get_hash_factors, hash_factors + + factors = get_hash_factors(self, ignored_factors) + factors["pass_config"] = self.pass_config.compute_hash() + return hash_factors(factors) def __repr__(self) -> str: exclude = { diff --git a/vllm/config/model.py b/vllm/config/model.py index f61dbb6a695a2..b563a40eb8fc9 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import hashlib -import json import warnings from collections.abc import Callable from dataclasses import InitVar, field @@ -18,7 +16,7 @@ import vllm.envs as envs from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig from vllm.config.pooler import PoolerConfig from vllm.config.scheduler import RunnerType -from vllm.config.utils import assert_hashable, config, getattr_iter +from vllm.config.utils import config, getattr_iter from vllm.logger 
import init_logger from vllm.platforms import current_platform from vllm.transformers_utils.config import ( @@ -324,50 +322,50 @@ class ModelConfig: excluding anything before input ids/embeddings and after the final hidden states. """ - factors: list[Any] = [] - factors.append(self.model) - factors.append(self.dtype) - factors.append(self.quantization) - factors.append(self.revision) - factors.append(self.code_revision) - factors.append(self.max_model_len) - factors.append(self.max_logprobs) - factors.append(self.disable_sliding_window) - factors.append(self.trust_remote_code) - factors.append(self.generation_config) - factors.append(self.model_impl) - factors.append(self.override_generation_config) - factors.append(self.video_pruning_rate) - factors.append(self.enable_prompt_embeds) + ignored_factors = { + "runner", + "convert", + "task", + "tokenizer", + "tokenizer_mode", + "seed", + "hf_config_path", + "allowed_local_media_path", + "allowed_media_domains", + "tokenizer_revision", + "spec_target_max_model_len", + "enforce_eager", + "logprobs_mode", + "disable_cascade_attn", + "skip_tokenizer_init", + "enable_prompt_embeds", + "served_model_name", + "config_format", + "hf_token", + "hf_overrides", + "logits_processor_pattern", + "enable_sleep_mode", + "override_attention_dtype", + "logits_processors", + "io_processor_plugin", + "pooler_config", + "override_pooler_config", + "multimodal_config", + "limit_mm_per_prompt", + "media_io_kwargs", + "mm_processor_kwargs", + "mm_processor_cache_gb", + "mm_processor_cache_type", + "mm_shm_cache_max_object_size_mb", + "mm_encoder_tp_mode", + "interleave_mm_strings", + "skip_mm_profiling", + } - # hf_config can control how the model looks! 
- try: - hf_config_json = self.hf_config.to_json_string(use_diff=False) - except TypeError: - from transformers import PretrainedConfig + from vllm.config.utils import get_hash_factors, hash_factors - from vllm.utils.jsontree import json_map_leaves - - # Handle nested HF configs with unserializable values gracefully - hf_config_json = ( - json.dumps( - json_map_leaves( - lambda v: v.to_dict() - if isinstance(v, PretrainedConfig) - else str(v), - self.hf_config.to_dict(), - ), - indent=2, - sort_keys=True, - ) - + "\n" - ) - - factors.append(hf_config_json) - - str_factors = str(factors) - assert_hashable(str_factors) - return hashlib.sha256(str(factors).encode()).hexdigest() + factors = get_hash_factors(self, ignored_factors) + return hash_factors(factors) def _update_nested( self, diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 9a6326d62e82e..0f107a7a3ef83 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import hashlib import os from typing import TYPE_CHECKING, Any, Literal @@ -448,19 +447,41 @@ class ParallelConfig: This hash is also used for DP worker configuration validation to prevent hangs from mismatched collective communication patterns. 
""" - factors: list[Any] = [] - factors.append(self.pipeline_parallel_size) - factors.append(self.tensor_parallel_size) - factors.append(self.enable_expert_parallel) - factors.append(self.data_parallel_size) - factors.append(self.all2all_backend) - factors.append(self.enable_eplb) - if self.enable_eplb: - factors.append(self.eplb_config.log_balancedness) - factors.append(self.eplb_config.window_size) - factors.append(self.eplb_config.step_interval) - factors.append(self.eplb_config.num_redundant_experts) - return hashlib.sha256(str(factors).encode()).hexdigest() + ignored_factors = { + # Derived/runtime topology, networking, or launch details + "data_parallel_rank", + "data_parallel_rank_local", + "data_parallel_backend", + "data_parallel_external_lb", + "data_parallel_hybrid_lb", + "data_parallel_master_ip", + "data_parallel_master_port", + "_data_parallel_master_port_list", + "data_parallel_rpc_port", + "rank", + "master_addr", + "master_port", + "node_rank", + "nnodes", + "max_parallel_loading_workers", + "disable_custom_all_reduce", + "ray_workers_use_nsight", + "ray_runtime_env", + "placement_group", + "distributed_executor_backend", + "worker_cls", + "sd_worker_cls", + "worker_extension_cls", + "_api_process_count", + "_api_process_rank", + } + + from vllm.config.utils import get_hash_factors, hash_factors + + factors = get_hash_factors(self, ignored_factors) + # Explicitly include backend affecting env factor as before + factors["VLLM_ALL2ALL_BACKEND"] = str(envs.VLLM_ALL2ALL_BACKEND) + return hash_factors(factors) def __post_init__(self) -> None: # Set all2all_backend from env var if not specified, with deprecation warning diff --git a/vllm/config/utils.py b/vllm/config/utils.py index 7e0878d96bbd6..02f2b75f608f1 100644 --- a/vllm/config/utils.py +++ b/vllm/config/utils.py @@ -3,14 +3,19 @@ """Utility functions for vLLM config dataclasses.""" import ast +import enum +import hashlib import inspect +import json +import pathlib import textwrap -from 
collections.abc import Iterable +from collections.abc import Iterable, Mapping, Sequence, Set from dataclasses import MISSING, Field, field, fields, is_dataclass, replace from itertools import pairwise from typing import TYPE_CHECKING, Any, Protocol, TypeVar import regex as re +import torch from pydantic.fields import FieldInfo from typing_extensions import runtime_checkable @@ -176,3 +181,115 @@ def update_config(config: ConfigT, overrides: dict[str, Any]) -> ConfigT: ) processed_overrides[field_name] = value return replace(config, **processed_overrides) + + +def normalize_value(x): + """Return a stable, JSON-serializable canonical form for hashing. + Order: primitives, special types (Enum, callable, torch.dtype, Path), then + generic containers (Mapping/Set/Sequence) with recursion. + """ + # Fast path + if x is None or isinstance(x, (bool, int, float, str)): + return x + + # Enums: tag with FQN to avoid primitive collisions. + # Ex: Enum(1) vs int(1) -> ("module.QualName", value). + if isinstance(x, enum.Enum): + enum_type = f"{x.__class__.__module__}.{x.__class__.__qualname__}" + return (enum_type, normalize_value(x.value)) + + # Classes (types) are accepted and canonicalized by their fully-qualified + # name (module.qualname) for a stable identifier. + # Instances are only accepted if they expose uuid(); otherwise they are + # rejected to avoid under-hashing object state. + + # Callables: accept classes only; reject funcs/lambdas/methods. + # Used by LogitsProcessor types and ModelConfig.hf_overrides. + if isinstance(x, type): + module = getattr(x, "__module__", "") + qual = getattr(x, "__qualname__", getattr(x, "__name__", "")) + return ".".join([p for p in (module, qual) if p]) or repr(x) + + # Prefer stable uuid identifiers for objects that provide them, even if + # they are callable instances (e.g., InductorPass wrappers). 
+ if hasattr(x, "uuid") and callable(getattr(x, "uuid", None)): + return x.uuid() + + if callable(x): + raise TypeError("normalize_value: function or callable instance unsupported") + + # Torch dtype: stringify (torch.float64 -> "torch.float64"). + # We rely on the string form here; dtype-bearing fields that need additional + # disambiguation should encode that at the config layer. + if isinstance(x, torch.dtype): + return str(x) + + # Bytes + if isinstance(x, (bytes, bytearray)): + return x.hex() + + # Paths (canonicalize) + if isinstance(x, pathlib.Path): + try: + return str(x.expanduser().resolve()) + except Exception: + return str(x) + + # Dataclasses: represent as (FQN, sorted(field,value) tuple) for stability. + if is_dataclass(x): + type_fqn = f"{x.__class__.__module__}.{x.__class__.__qualname__}" + items = tuple( + (f.name, normalize_value(getattr(x, f.name))) + for f in sorted(fields(x), key=lambda f: f.name) + ) + return (type_fqn, items) + + # Containers (generic) + if isinstance(x, Mapping): + return tuple(sorted((str(k), normalize_value(v)) for k, v in x.items())) + if isinstance(x, Set): + return tuple(sorted(repr(normalize_value(v)) for v in x)) + if isinstance(x, Sequence) and not isinstance(x, (str, bytes, bytearray)): + return tuple(normalize_value(v) for v in x) + + # PretrainedConfig + if hasattr(x, "to_json_string") and callable(x.to_json_string): + return x.to_json_string() + + # Unsupported type: e.g., modules, generators, open files, or objects + # without a stable JSON/UUID representation. Hard-error to avoid + # under-hashing. + # If you hit this, either reshape your config to use supported primitives + # and containers, or extend normalize_value to provide a stable encoding + # (e.g., via uuid() or to_json_string()) for this type. + raise TypeError( + f"normalize_value: unsupported type '{type(x).__name__}'. " + "Ensure config values use supported primitives/containers or add a " + "stable representation for this type." 
+ ) + + +def get_hash_factors(config: ConfigT, ignored_factors: set[str]) -> dict[str, object]: + """Gets the factors used for hashing a config class. + - Includes all dataclass fields not in `ignored_factors`. + - Errors on non-normalizable values. + """ + factors: dict[str, object] = {} + for dc_field in fields(config): + factor = dc_field.name + if factor in ignored_factors: + continue + value = getattr(config, factor, None) + try: + factors[factor] = normalize_value(value) + except TypeError as e: + raise TypeError( + f"get_hash_factors: unsupported type for key '{factor}' " + f"({type(value).__name__})" + ) from e + return factors + + +def hash_factors(items: dict[str, object]) -> str: + """Return a SHA-256 hex digest of the canonical items structure.""" + return hashlib.sha256(json.dumps(items, sort_keys=True).encode()).hexdigest() diff --git a/vllm/envs.py b/vllm/envs.py index e61fb114325c6..212d68114e46e 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -2,8 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools -import hashlib import json +import logging import os import sys import tempfile @@ -426,6 +426,8 @@ def get_vllm_port() -> int | None: # --8<-- [start:env-vars-definition] +logger = logging.getLogger(__name__) + environment_variables: dict[str, Callable[[], Any]] = { # ================== Installation Time Env Vars ================== # Target device of vLLM, supporting [cuda (by default), @@ -1540,85 +1542,88 @@ def is_set(name: str): raise AttributeError(f"module {__name__!r} has no attribute {name!r}") -def compute_hash() -> str: - """ - WARNING: Whenever a new key is added to this environment - variables, ensure that it is included in the factors list if - it affects the computation graph. For example, different values - of VLLM_PP_LAYER_PARTITION will generate different computation - graphs, so it is included in the factors list. 
The env vars that - affect the choice of different kernels or attention backends should - also be included in the factors list. - """ +def compile_factors() -> dict[str, object]: + """Return env vars used for torch.compile cache keys. - # The values of envs may affects the computation graph. - # TODO(DefTruth): hash all environment variables? - # for key in environment_variables: - # factorize(key) - environment_variables_to_hash = [ - "VLLM_PP_LAYER_PARTITION", - "VLLM_MLA_DISABLE", - "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH", - "VLLM_USE_TRITON_AWQ", - "VLLM_DP_RANK", - "VLLM_DP_SIZE", - "VLLM_USE_STANDALONE_COMPILE", - "VLLM_FUSED_MOE_CHUNK_SIZE", - "VLLM_FLASHINFER_MOE_BACKEND", - "VLLM_V1_USE_PREFILL_DECODE_ATTENTION", - "VLLM_ATTENTION_BACKEND", - "VLLM_USE_FLASHINFER_SAMPLER", - "VLLM_DISABLED_KERNELS", - "VLLM_USE_DEEP_GEMM", - "VLLM_MOE_USE_DEEP_GEMM", - "VLLM_USE_DEEP_GEMM_E8M0", - "VLLM_USE_FUSED_MOE_GROUPED_TOPK", - "VLLM_USE_FLASHINFER_MOE_FP16", - "VLLM_USE_FLASHINFER_MOE_FP8", - "VLLM_USE_FLASHINFER_MOE_FP4", - "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", - "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", - "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", - "VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE", - "VLLM_USE_CUDNN_PREFILL", - "VLLM_USE_TRTLLM_RAGGED_DEEPSEEK_PREFILL", - "VLLM_USE_TRTLLM_ATTENTION", - "VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION", - "VLLM_ROCM_USE_AITER", - "VLLM_ROCM_USE_AITER_PAGED_ATTN", - "VLLM_ROCM_USE_AITER_LINEAR", - "VLLM_ROCM_USE_AITER_MOE", - "VLLM_ROCM_USE_AITER_RMSNORM", - "VLLM_ROCM_USE_AITER_MLA", - "VLLM_ROCM_USE_AITER_MHA", - "VLLM_ROCM_USE_AITER_FP4_ASM_GEMM", - "VLLM_ROCM_USE_AITER_TRITON_ROPE", - "VLLM_ROCM_USE_AITER_FP8BMM", - "VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION", - "VLLM_ROCM_USE_AITER_TRITON_GEMM", - "VLLM_ROCM_USE_SKINNY_GEMM", - "VLLM_ROCM_FP8_PADDING", - "VLLM_ROCM_MOE_PADDING", - "VLLM_ROCM_CUSTOM_PAGED_ATTN", - "VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", - "VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16", - 
"VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", - "VLLM_ROCM_FP8_MFMA_PAGE_ATTN", - "VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE", - "VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING", - "VLLM_NVFP4_GEMM_BACKEND", - "VLLM_USE_FBGEMM", - "VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE", - "VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL", - ] - for key in environment_variables_to_hash: - # if this goes out of sync with environment_variables, - # it's not a user error, it's a bug - assert key in environment_variables, ( - "Please update environment_variables_to_hash in envs.py" - ) + Start with every known vLLM env var; drop entries in `ignored_factors`; + hash everything else. This keeps the cache key aligned across workers.""" - factors = [environment_variables[key]() for key in environment_variables_to_hash] + ignored_factors: set[str] = { + "MAX_JOBS", + "VLLM_RPC_BASE_PATH", + "VLLM_USE_MODELSCOPE", + "VLLM_RINGBUFFER_WARNING_INTERVAL", + "VLLM_DEBUG_DUMP_PATH", + "VLLM_PORT", + "VLLM_CACHE_ROOT", + "LD_LIBRARY_PATH", + "VLLM_SERVER_DEV_MODE", + "VLLM_DP_MASTER_IP", + "VLLM_DP_MASTER_PORT", + "VLLM_RANDOMIZE_DP_DUMMY_INPUTS", + "VLLM_CI_USE_S3", + "VLLM_MODEL_REDIRECT_PATH", + "VLLM_HOST_IP", + "S3_ACCESS_KEY_ID", + "S3_SECRET_ACCESS_KEY", + "S3_ENDPOINT_URL", + "VLLM_USAGE_STATS_SERVER", + "VLLM_NO_USAGE_STATS", + "VLLM_DO_NOT_TRACK", + "VLLM_LOGGING_LEVEL", + "VLLM_LOGGING_PREFIX", + "VLLM_LOGGING_STREAM", + "VLLM_LOGGING_CONFIG_PATH", + "VLLM_LOG_STATS_INTERVAL", + "VLLM_DEBUG_LOG_API_SERVER_RESPONSE", + "VLLM_TUNED_CONFIG_FOLDER", + "VLLM_ENGINE_ITERATION_TIMEOUT_S", + "VLLM_HTTP_TIMEOUT_KEEP_ALIVE", + "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS", + "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", + "VLLM_SLEEP_WHEN_IDLE", + "VLLM_IMAGE_FETCH_TIMEOUT", + "VLLM_VIDEO_FETCH_TIMEOUT", + "VLLM_AUDIO_FETCH_TIMEOUT", + "VLLM_MEDIA_URL_ALLOW_REDIRECTS", + "VLLM_MEDIA_LOADING_THREAD_COUNT", + "VLLM_MAX_AUDIO_CLIP_FILESIZE_MB", + "VLLM_VIDEO_LOADER_BACKEND", + "VLLM_MEDIA_CONNECTOR", + "VLLM_ASSETS_CACHE", + 
"VLLM_ASSETS_CACHE_MODEL_CLEAN", + "VLLM_MM_INPUT_CACHE_GIB", + "VLLM_WORKER_MULTIPROC_METHOD", + "VLLM_ENABLE_V1_MULTIPROCESSING", + "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", + "VLLM_CPU_KVCACHE_SPACE", + "VLLM_CPU_OMP_THREADS_BIND", + "VLLM_CPU_NUM_OF_RESERVED_CPU", + "VLLM_CPU_MOE_PREPACK", + "VLLM_CPU_SGL_KERNEL", + "VLLM_TEST_FORCE_LOAD_FORMAT", + "LOCAL_RANK", + "CUDA_VISIBLE_DEVICES", + } + + from vllm.config.utils import normalize_value + + factors: dict[str, object] = {} + for factor, getter in environment_variables.items(): + if factor in ignored_factors: + continue + + try: + raw = getter() + except Exception as exc: # pragma: no cover - defensive logging + logger.warning( + "Skipping environment variable %s while hashing compile factors: %s", + factor, + exc, + ) + continue + + factors[factor] = normalize_value(raw) ray_noset_env_vars = [ # Refer to @@ -1641,8 +1646,8 @@ def compute_hash() -> str: "RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR", "RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES", ] - factors.extend([os.getenv(var) for var in ray_noset_env_vars]) - hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() + for var in ray_noset_env_vars: + factors[var] = normalize_value(os.getenv(var)) - return hash_str + return factors diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py index 7202259ca21aa..44b40ead973ba 100644 --- a/vllm/logging_utils/__init__.py +++ b/vllm/logging_utils/__init__.py @@ -2,9 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.logging_utils.formatter import NewLineFormatter +from vllm.logging_utils.lazy import lazy from vllm.logging_utils.log_time import logtime __all__ = [ "NewLineFormatter", + "lazy", "logtime", ] diff --git a/vllm/logging_utils/lazy.py b/vllm/logging_utils/lazy.py new file mode 100644 index 0000000000000..3ade798962857 --- /dev/null +++ b/vllm/logging_utils/lazy.py @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable +from typing import Any + + +class lazy: + """Wrap a zero-argument callable evaluated only during log formatting.""" + + __slots__ = ("_factory",) + + def __init__(self, factory: Callable[[], Any]) -> None: + self._factory = factory + + def __str__(self) -> str: + return str(self._factory()) + + def __repr__(self) -> str: + return str(self) From 48fc8b1e595766af9c91edfc1de43f3a352575eb Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 19 Nov 2025 10:04:07 -0500 Subject: [PATCH 021/249] [BugFix] Fix async-scheduling + FlashAttn MLA (#28990) Signed-off-by: Lucas Wilkinson --- vllm/v1/attention/backends/mla/common.py | 15 +++++++++------ vllm/v1/attention/backends/mla/flashattn_mla.py | 2 +- vllm/v1/attention/backends/utils.py | 1 + vllm/v1/worker/gpu_model_runner.py | 10 +++++++--- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 2ccdd1f143ce8..e328049b53c7e 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -755,6 +755,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): seq_lens = common_attn_metadata.seq_lens seq_lens_cpu = common_attn_metadata.seq_lens_cpu dcp_local_seq_lens = common_attn_metadata.dcp_local_seq_lens + dcp_local_seq_lens_cpu = common_attn_metadata.dcp_local_seq_lens_cpu query_seq_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] @@ -944,18 +945,20 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): decode_metadata = None if num_decodes > 0: + dcp_tot_seq_lens_device = None + if self.dcp_world_size > 1: + dcp_tot_seq_lens_device = seq_lens[:num_decodes] + seq_lens_cpu = dcp_local_seq_lens_cpu + seq_lens = dcp_local_seq_lens + decode_metadata = self._build_decode( block_table_tensor=block_table_tensor[:num_decodes, ...], 
seq_lens_cpu=seq_lens_cpu[:num_decodes], - seq_lens_device=dcp_local_seq_lens[:num_decodes] - if self.dcp_world_size > 1 and dcp_local_seq_lens is not None - else seq_lens[:num_decodes], + seq_lens_device=seq_lens[:num_decodes], query_start_loc_cpu=query_start_loc_cpu[: num_decodes + 1], query_start_loc_device=query_start_loc[: num_decodes + 1], num_decode_tokens=num_decode_tokens, - dcp_tot_seq_lens_device=seq_lens[:num_decodes] - if self.dcp_world_size > 1 - else None, + dcp_tot_seq_lens_device=dcp_tot_seq_lens_device, ) attn_metadata = self.metadata_cls( diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py index 7794e89cc0a94..12639edc8b9a1 100644 --- a/vllm/v1/attention/backends/mla/flashattn_mla.py +++ b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -173,7 +173,7 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata] ) -> FlashAttnMLADecodeMetadata: query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] max_query_len = query_lens_cpu.max().item() - max_seq_len = seq_lens_device.max().item() + max_seq_len = seq_lens_cpu.max().item() # For Flash Attention MLA + full cudagraph max_num_splits = 0 diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 578153cda7863..0dd1896331291 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -92,6 +92,7 @@ class CommonAttentionMetadata: encoder_seq_lens: np.ndarray | None = None dcp_local_seq_lens: torch.Tensor | None = None + dcp_local_seq_lens_cpu: torch.Tensor | None = None """Sequence lengths of the local rank in decode context parallelism world""" diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 506118d2d762b..3b00085b6bb99 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1451,9 +1451,12 @@ class GPUModelRunner( num_computed_tokens_cpu = 
self.input_batch.num_computed_tokens_cpu_tensor[ :num_reqs ] - dcp_local_seq_lens = ( - self.dcp_local_seq_lens.gpu[:num_reqs] if self.dcp_world_size > 1 else None - ) + + dcp_local_seq_lens, dcp_local_seq_lens_cpu = None, None + if self.dcp_world_size > 1: + dcp_local_seq_lens = self.dcp_local_seq_lens.gpu[:num_reqs] + dcp_local_seq_lens_cpu = self.dcp_local_seq_lens.cpu[:num_reqs] + spec_decode_common_attn_metadata = None if for_cudagraph_capture: @@ -1521,6 +1524,7 @@ class GPUModelRunner( causal=True, encoder_seq_lens=encoder_seq_lens, dcp_local_seq_lens=dcp_local_seq_lens, + dcp_local_seq_lens_cpu=dcp_local_seq_lens_cpu, ) if self.speculative_config and spec_decode_common_attn_metadata is None: From d44e9df7d49a9bb3400b002c38c06fae2dd7d1e8 Mon Sep 17 00:00:00 2001 From: Shanshan Shen <467638484@qq.com> Date: Thu, 20 Nov 2025 00:24:55 +0800 Subject: [PATCH 022/249] [Model][Mamba] Add selector for mamba attention backend and make it pluggable for other device (#26487) Signed-off-by: shen-shanshan <467638484@qq.com> --- docs/contributing/model/basic.md | 1 + vllm/attention/__init__.py | 3 +- vllm/attention/backends/registry.py | 114 +++++++++++++++--- vllm/attention/selector.py | 33 ++++- vllm/model_executor/layers/kda.py | 8 +- vllm/model_executor/layers/mamba/abstract.py | 10 +- .../layers/mamba/linear_attn.py | 14 --- .../layers/mamba/mamba_mixer.py | 10 +- .../layers/mamba/mamba_mixer2.py | 9 -- .../model_executor/layers/mamba/short_conv.py | 9 -- vllm/model_executor/models/plamo2.py | 9 -- vllm/model_executor/models/qwen3_next.py | 9 +- 12 files changed, 144 insertions(+), 85 deletions(-) diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md index a7b54f015c2da..d7f5d2f311a37 100644 --- a/docs/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -146,6 +146,7 @@ We use "mamba-like" to refer to layers that posses a state that is updated in-pl For implementing new custom mamba-like layers, one should inherit from 
`MambaBase` and implement the methods `get_state_dtype`, `get_state_shape` to calculate the data types and state shapes at runtime, as well as `mamba_type` and `get_attn_backend`. It is also necessary to implement the "attention meta-data" class which handles the meta-data that is common across all layers. Please see [`LinearAttentionMetadata`](../../../vllm/v1/attention/backends/linear_attn.py) or [`ShortConvAttentionMetadata`](../../../vllm/v1/attention/backends/short_conv_attn.py) for examples of this. +It is also worth noting that we should update `MAMBA_TYPE_TO_BACKEND_MAP` and `MambaAttentionBackendEnum` in [`registry.py`](../../../vllm/attention/backends/registry.py) when adding a new mamba backend. Finally, if one wants to support torch compile and CUDA graphs, it necessary to wrap the call to the mamba-like layer inside a custom op and register it. Please see the calls to `direct_register_custom_op` in [vllm/model_executor/models/minimax_text_01.py](../../../vllm/model_executor/models/minimax_text_01.py) or [vllm/model_executor/layers/mamba/short_conv.py](../../../vllm/model_executor/layers/mamba/short_conv.py) for examples of this. The new custom op should then be added to the list `_attention_ops` in [vllm/config/compilation.py](../../../vllm/config/compilation.py) to ensure that piecewise CUDA graphs works as intended. 
diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py index dd35165d5415e..8b4dc4013362e 100644 --- a/vllm/attention/__init__.py +++ b/vllm/attention/__init__.py @@ -7,7 +7,7 @@ from vllm.attention.backends.abstract import ( AttentionType, ) from vllm.attention.layer import Attention -from vllm.attention.selector import get_attn_backend +from vllm.attention.selector import get_attn_backend, get_mamba_attn_backend __all__ = [ "Attention", @@ -15,4 +15,5 @@ __all__ = [ "AttentionMetadata", "AttentionType", "get_attn_backend", + "get_mamba_attn_backend", ] diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py index f07a6059be377..51899b0235915 100644 --- a/vllm/attention/backends/registry.py +++ b/vllm/attention/backends/registry.py @@ -2,8 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention backend registry""" -import enum from collections.abc import Callable +from enum import Enum, EnumMeta from typing import TYPE_CHECKING, cast from vllm.logger import init_logger @@ -15,7 +15,7 @@ if TYPE_CHECKING: logger = init_logger(__name__) -class _AttentionBackendEnumMeta(enum.EnumMeta): +class _AttentionBackendEnumMeta(EnumMeta): """Metaclass for AttentionBackendEnum to provide better error messages.""" def __getitem__(cls, name: str): @@ -23,15 +23,15 @@ class _AttentionBackendEnumMeta(enum.EnumMeta): try: return super().__getitem__(name) except KeyError: - members = cast("dict[str, AttentionBackendEnum]", cls.__members__).values() - valid_backends = ", ".join(m.name for m in members) + members = cast("dict[str, Enum]", cls.__members__).keys() + valid_backends = ", ".join(members) raise ValueError( f"Unknown attention backend: '{name}'. " f"Valid options are: {valid_backends}" ) from None -class AttentionBackendEnum(enum.Enum, metaclass=_AttentionBackendEnumMeta): +class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta): """Enumeration of all supported attention backends. 
The enum value is the default class path, but this can be overridden @@ -83,7 +83,7 @@ class AttentionBackendEnum(enum.Enum, metaclass=_AttentionBackendEnumMeta): Raises: ValueError: If Backend.CUSTOM is used without being registered """ - path = _OVERRIDES.get(self, self.value) + path = _ATTN_OVERRIDES.get(self, self.value) if not path: raise ValueError( f"Backend {self.name} must be registered before use. " @@ -111,18 +111,93 @@ class AttentionBackendEnum(enum.Enum, metaclass=_AttentionBackendEnumMeta): Returns: True if the backend has a registered override """ - return self in _OVERRIDES + return self in _ATTN_OVERRIDES def clear_override(self) -> None: """Clear any override for this backend, reverting to the default.""" - _OVERRIDES.pop(self, None) + _ATTN_OVERRIDES.pop(self, None) -_OVERRIDES: dict[AttentionBackendEnum, str] = {} +class MambaAttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta): + """Enumeration of all supported mamba attention backends. + + The enum value is the default class path, but this can be overridden + at runtime using register_backend(). + + To get the actual backend class (respecting overrides), use: + backend.get_class() + """ + + MAMBA1 = "vllm.v1.attention.backends.mamba1_attn.Mamba1AttentionBackend" + MAMBA2 = "vllm.v1.attention.backends.mamba2_attn.Mamba2AttentionBackend" + SHORT_CONV = "vllm.v1.attention.backends.short_conv_attn.ShortConvAttentionBackend" + LINEAR = "vllm.v1.attention.backends.linear_attn.LinearAttentionBackend" + GDN_ATTN = "vllm.v1.attention.backends.gdn_attn.GDNAttentionBackend" + # Placeholder for third-party/custom backends - must be registered before use + CUSTOM = "" + + def get_path(self, include_classname: bool = True) -> str: + """Get the class path for this backend (respects overrides). 
+ + Returns: + The fully qualified class path string + + Raises: + ValueError: If Backend.CUSTOM is used without being registered + """ + path = _MAMBA_ATTN_OVERRIDES.get(self, self.value) + if not path: + raise ValueError( + f"Backend {self.name} must be registered before use. " + f"Use register_backend(Backend.{self.name}, 'your.module.YourClass')" + ) + if not include_classname: + path = path.rsplit(".", 1)[0] + return path + + def get_class(self) -> "type[AttentionBackend]": + """Get the backend class (respects overrides). + + Returns: + The backend class + + Raises: + ImportError: If the backend class cannot be imported + ValueError: If Backend.CUSTOM is used without being registered + """ + return resolve_obj_by_qualname(self.get_path()) + + def is_overridden(self) -> bool: + """Check if this backend has been overridden. + + Returns: + True if the backend has a registered override + """ + return self in _MAMBA_ATTN_OVERRIDES + + def clear_override(self) -> None: + """Clear any override for this backend, reverting to the default.""" + _MAMBA_ATTN_OVERRIDES.pop(self, None) + + +MAMBA_TYPE_TO_BACKEND_MAP = { + "mamba1": MambaAttentionBackendEnum.MAMBA1.name, + "mamba2": MambaAttentionBackendEnum.MAMBA2.name, + "short_conv": MambaAttentionBackendEnum.SHORT_CONV.name, + "linear_attention": MambaAttentionBackendEnum.LINEAR.name, + "gdn_attention": MambaAttentionBackendEnum.GDN_ATTN.name, + "custom": MambaAttentionBackendEnum.CUSTOM.name, +} + + +_ATTN_OVERRIDES: dict[AttentionBackendEnum, str] = {} +_MAMBA_ATTN_OVERRIDES: dict[MambaAttentionBackendEnum, str] = {} def register_backend( - backend: AttentionBackendEnum, class_path: str | None = None + backend: AttentionBackendEnum | MambaAttentionBackendEnum, + is_mamba: bool = False, + class_path: str | None = None, ) -> Callable[[type], type]: """Register or override a backend implementation. 
@@ -135,12 +210,17 @@ def register_backend( Decorator function if class_path is None, otherwise a no-op Examples: - # Override an existing backend + # Override an existing attention backend @register_backend(AttentionBackendEnum.FLASH_ATTN) class MyCustomFlashAttn: ... - # Register a custom third-party backend + # Override an existing mamba attention backend + @register_backend(MambaAttentionBackendEnum.LINEAR, is_mamba=True) + class MyCustomMambaAttn: + ... + + # Register a custom third-party attention backend @register_backend(AttentionBackendEnum.CUSTOM) class MyCustomBackend: ... @@ -153,11 +233,17 @@ def register_backend( """ def decorator(cls: type) -> type: - _OVERRIDES[backend] = f"{cls.__module__}.{cls.__qualname__}" + if is_mamba: + _MAMBA_ATTN_OVERRIDES[backend] = f"{cls.__module__}.{cls.__qualname__}" # type: ignore[index] + else: + _ATTN_OVERRIDES[backend] = f"{cls.__module__}.{cls.__qualname__}" # type: ignore[index] return cls if class_path is not None: - _OVERRIDES[backend] = class_path + if is_mamba: + _MAMBA_ATTN_OVERRIDES[backend] = class_path # type: ignore[index] + else: + _ATTN_OVERRIDES[backend] = class_path # type: ignore[index] return lambda x: x return decorator diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 1a092db9ce378..e9af08b2316d2 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -12,7 +12,11 @@ import torch import vllm.envs as envs from vllm.attention.backends.abstract import AttentionBackend -from vllm.attention.backends.registry import AttentionBackendEnum +from vllm.attention.backends.registry import ( + MAMBA_TYPE_TO_BACKEND_MAP, + AttentionBackendEnum, + MambaAttentionBackendEnum, +) from vllm.config.cache import CacheDType from vllm.logger import init_logger from vllm.utils import STR_BACKEND_ENV_VAR @@ -197,6 +201,33 @@ def _cached_get_attn_backend( return backend +def get_mamba_attn_backend( + mamba_type: str, +) -> type[AttentionBackend]: + """Select which mamba 
attention backend to use and lazily import it.""" + return _cached_get_mamba_attn_backend(mamba_type) + + +@cache +def _cached_get_mamba_attn_backend( + mamba_type: str, +) -> type[AttentionBackend]: + assert mamba_type and isinstance(mamba_type, str) + + selected_backend = None + try: + backend_name = MAMBA_TYPE_TO_BACKEND_MAP[mamba_type] + selected_backend = MambaAttentionBackendEnum[backend_name] + except KeyError as e: + raise ValueError( + f"Invalid mamba attention backend type: '{backend_name}'. Valid " + f"backends are: {list(MambaAttentionBackendEnum.__members__.keys())}" + ) from e + + mamba_attn_backend = selected_backend.get_class() + return mamba_attn_backend + + @contextmanager def global_force_attn_backend_context_manager( attn_backend: AttentionBackendEnum, diff --git a/vllm/model_executor/layers/kda.py b/vllm/model_executor/layers/kda.py index 2e7500bac7188..27cc3884517f9 100644 --- a/vllm/model_executor/layers/kda.py +++ b/vllm/model_executor/layers/kda.py @@ -5,7 +5,6 @@ import torch from einops import rearrange from torch import nn -from vllm.attention import AttentionBackend from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import CacheConfig, ModelConfig, get_current_vllm_config from vllm.distributed import ( @@ -83,12 +82,7 @@ direct_register_custom_op( class KimiDeltaAttention(nn.Module, MambaBase): @property def mamba_type(self) -> str: - return "linear_attention" - - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.gdn_attn import GDNAttentionBackend - - return GDNAttentionBackend + return "gdn_attention" def get_state_dtype( self, diff --git a/vllm/model_executor/layers/mamba/abstract.py b/vllm/model_executor/layers/mamba/abstract.py index e68b09b4d81f5..aa919d6fdc35c 100644 --- a/vllm/model_executor/layers/mamba/abstract.py +++ b/vllm/model_executor/layers/mamba/abstract.py @@ -6,6 +6,7 @@ from typing import TYPE_CHECKING import torch +from vllm.attention.selector 
import get_mamba_attn_backend from vllm.config import VllmConfig from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.v1.kv_cache_interface import KVCacheSpec, MambaSpec @@ -38,11 +39,6 @@ class MambaBase(AttentionLayerBase): def mamba_type(self) -> str: pass - @abstractmethod - def get_attn_backend(self) -> type["AttentionBackend"]: - """Get the attention backend class for this Mamba layer.""" - pass - @abstractmethod def get_state_dtype(self) -> tuple[torch.dtype, ...]: pass @@ -69,3 +65,7 @@ class MambaBase(AttentionLayerBase): else 0 ), ) + + def get_attn_backend(self) -> type["AttentionBackend"]: + """Get the attention backend class for this Mamba layer.""" + return get_mamba_attn_backend(self.mamba_type) diff --git a/vllm/model_executor/layers/mamba/linear_attn.py b/vllm/model_executor/layers/mamba/linear_attn.py index 0a2742ff49a44..d85b3e61c5d61 100644 --- a/vllm/model_executor/layers/mamba/linear_attn.py +++ b/vllm/model_executor/layers/mamba/linear_attn.py @@ -2,12 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -from typing import TYPE_CHECKING import torch import torch.nn.functional as F @@ -37,9 +31,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.utils.torch_utils import direct_register_custom_op from vllm.v1.attention.backends.linear_attn import LinearAttentionMetadata -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - class MiniMaxText01RMSNormTP(CustomOp): name = "MiniMaxText01RMSNormTP" @@ -123,11 +114,6 @@ class MiniMaxText01LinearAttention(nn.Module, MambaBase): def mamba_type(self) -> str: return "linear_attention" - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.linear_attn import LinearAttentionBackend - - return LinearAttentionBackend 
- def get_state_dtype(self) -> tuple[torch.dtype]: assert self.model_config is not None assert self.cache_config is not None diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index b6345b8af7f0a..90e520e244416 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -1,10 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import TYPE_CHECKING, NamedTuple - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend +from typing import NamedTuple import torch from torch import nn @@ -452,11 +449,6 @@ class MambaMixer(MambaBase, CustomOp): def mamba_type(self) -> str: return "mamba1" - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionBackend - - return Mamba1AttentionBackend - def _time_proj_bias(self) -> torch.Tensor | None: if hasattr(self.dt_proj, "bias") and self.dt_proj.bias is not None: return self.dt_proj.bias.float() diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 57313990b8206..900701c46348b 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -1,10 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend import torch from torch import nn @@ -908,11 +904,6 @@ class MambaMixer2(MambaBase, CustomOp): def mamba_type(self) -> str: return "mamba2" - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionBackend - - return Mamba2AttentionBackend - def mamba_mixer2( projected_states: torch.Tensor, 
diff --git a/vllm/model_executor/layers/mamba/short_conv.py b/vllm/model_executor/layers/mamba/short_conv.py index 04efa8a8b3734..0bbad17d7ebc7 100644 --- a/vllm/model_executor/layers/mamba/short_conv.py +++ b/vllm/model_executor/layers/mamba/short_conv.py @@ -1,10 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend import torch @@ -232,11 +228,6 @@ class ShortConv(MambaBase, CustomOp): def mamba_type(self) -> str: return "short_conv" - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.short_conv_attn import ShortConvAttentionBackend - - return ShortConvAttentionBackend - def short_conv( hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 0c87f5000ff45..52c9755e0e0ea 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -4,10 +4,6 @@ from collections.abc import Iterable from itertools import islice -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend import torch from torch import nn @@ -467,11 +463,6 @@ class Plamo2MambaMixer(MambaBase, CustomOp): def mamba_type(self) -> str: return "mamba2" - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionBackend - - return Mamba2AttentionBackend - def plamo2_mamba_mixer( hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 0415c8e00fdfa..ad631f61e4b93 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -10,7 +10,7 @@ from einops import rearrange from torch import nn from transformers.activations import ACT2FN -from vllm.attention 
import Attention, AttentionBackend, AttentionMetadata +from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile from vllm.config import ( CacheConfig, @@ -216,12 +216,7 @@ class Qwen3NextSparseMoeBlock(nn.Module): class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): @property def mamba_type(self) -> str: - return "linear_attention" - - def get_attn_backend(self) -> type["AttentionBackend"]: - from vllm.v1.attention.backends.gdn_attn import GDNAttentionBackend - - return GDNAttentionBackend + return "gdn_attention" def get_state_dtype(self) -> tuple[torch.dtype, torch.dtype]: return MambaStateDtypeCalculator.gated_delta_net_state_dtype( From a8b70304d68497ac1c432a2ff343e9bfb516c227 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 19 Nov 2025 18:06:36 +0100 Subject: [PATCH 023/249] Update `rope_scaling` to `rope_parameters` in preparation for Transformers v5 (#28542) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 6 +- benchmarks/kernels/benchmark_mrope.py | 19 ++-- .../offline_inference/context_extension.py | 6 +- tests/compile/test_functionalization.py | 4 +- tests/kernels/core/test_mrope.py | 16 +-- tests/kernels/core/test_pos_encoding.py | 39 +++---- .../moe/test_gpt_oss_triton_kernels.py | 2 +- .../pooling/test_nomic_max_model_len.py | 16 +-- tests/test_config.py | 37 ++++--- vllm/config/model.py | 59 +++++------ .../layers/rotary_embedding/__init__.py | 76 ++++++------- vllm/model_executor/models/afmoe.py | 17 +-- vllm/model_executor/models/apertus.py | 22 +--- vllm/model_executor/models/arcee.py | 11 -- vllm/model_executor/models/arctic.py | 3 +- vllm/model_executor/models/baichuan.py | 8 +- vllm/model_executor/models/bailing_moe.py | 3 +- vllm/model_executor/models/bamba.py | 6 +- vllm/model_executor/models/chameleon.py | 29 +---- vllm/model_executor/models/chatglm.py | 3 +- 
vllm/model_executor/models/commandr.py | 5 +- vllm/model_executor/models/config.py | 22 ++-- vllm/model_executor/models/dbrx.py | 7 +- vllm/model_executor/models/deepseek_v2.py | 43 +++----- vllm/model_executor/models/dots1.py | 11 +- vllm/model_executor/models/ernie45_moe.py | 14 +-- vllm/model_executor/models/ernie45_vl_moe.py | 13 +-- vllm/model_executor/models/exaone.py | 21 +--- vllm/model_executor/models/exaone4.py | 19 +--- vllm/model_executor/models/falcon.py | 3 +- vllm/model_executor/models/falcon_h1.py | 8 +- vllm/model_executor/models/gemma.py | 8 +- vllm/model_executor/models/gemma2.py | 5 +- vllm/model_executor/models/gemma3.py | 21 ++-- vllm/model_executor/models/gemma3n.py | 20 ++-- vllm/model_executor/models/glm4.py | 10 +- vllm/model_executor/models/glm4_1v.py | 1 - vllm/model_executor/models/glm4_moe.py | 11 +- vllm/model_executor/models/gpt_j.py | 3 +- vllm/model_executor/models/gpt_neox.py | 3 +- vllm/model_executor/models/gpt_oss.py | 13 ++- vllm/model_executor/models/granite.py | 17 +-- vllm/model_executor/models/granitemoe.py | 13 +-- .../model_executor/models/granitemoehybrid.py | 5 +- .../model_executor/models/granitemoeshared.py | 6 +- vllm/model_executor/models/grok1.py | 11 +- vllm/model_executor/models/hunyuan_v1.py | 25 +---- vllm/model_executor/models/internlm2.py | 12 +-- vllm/model_executor/models/internlm2_ve.py | 5 +- vllm/model_executor/models/kimi_linear.py | 5 - vllm/model_executor/models/lfm2.py | 17 +-- vllm/model_executor/models/lfm2_moe.py | 17 +-- vllm/model_executor/models/llama.py | 22 +--- vllm/model_executor/models/llama4.py | 11 +- vllm/model_executor/models/longcat_flash.py | 22 ++-- vllm/model_executor/models/minicpm.py | 12 +-- vllm/model_executor/models/minicpm3.py | 10 +- vllm/model_executor/models/minicpm_eagle.py | 5 +- vllm/model_executor/models/minimax_m2.py | 12 +-- vllm/model_executor/models/minimax_text_01.py | 9 +- vllm/model_executor/models/mixtral.py | 7 +- vllm/model_executor/models/mllama4.py | 8 +- 
vllm/model_executor/models/molmo.py | 3 +- vllm/model_executor/models/nemotron.py | 17 +-- vllm/model_executor/models/nemotron_nas.py | 19 +--- vllm/model_executor/models/olmo.py | 3 +- vllm/model_executor/models/olmo2.py | 13 +-- vllm/model_executor/models/olmoe.py | 6 +- vllm/model_executor/models/openpangu.py | 26 ++--- vllm/model_executor/models/orion.py | 12 +-- vllm/model_executor/models/ouro.py | 11 +- vllm/model_executor/models/persimmon.py | 3 +- vllm/model_executor/models/phi.py | 6 +- vllm/model_executor/models/phimoe.py | 18 ++-- vllm/model_executor/models/plamo2.py | 7 +- vllm/model_executor/models/qwen.py | 11 +- vllm/model_executor/models/qwen2.py | 16 +-- vllm/model_executor/models/qwen2_5_vl.py | 1 - vllm/model_executor/models/qwen2_moe.py | 12 +-- vllm/model_executor/models/qwen2_vl.py | 1 - vllm/model_executor/models/qwen3.py | 15 +-- vllm/model_executor/models/qwen3_moe.py | 12 +-- vllm/model_executor/models/qwen3_next.py | 3 +- .../models/qwen3_omni_moe_thinker.py | 1 - vllm/model_executor/models/qwen3_vl.py | 1 - vllm/model_executor/models/seed_oss.py | 15 +-- vllm/model_executor/models/solar.py | 18 +--- vllm/model_executor/models/stablelm.py | 2 +- vllm/model_executor/models/starcoder2.py | 3 +- vllm/model_executor/models/step3_text.py | 16 ++- .../models/transformers/utils.py | 10 +- vllm/model_executor/models/zamba2.py | 4 +- vllm/transformers_utils/config.py | 100 +++++++++++++----- vllm/transformers_utils/configs/afmoe.py | 7 +- vllm/transformers_utils/configs/arctic.py | 18 +++- vllm/transformers_utils/configs/flex_olmo.py | 17 +-- .../transformers_utils/configs/kimi_linear.py | 12 ++- vllm/transformers_utils/configs/lfm2_moe.py | 12 ++- .../transformers_utils/configs/midashenglm.py | 2 +- vllm/transformers_utils/configs/mistral.py | 4 +- vllm/transformers_utils/configs/nemotron.py | 60 ++++++----- vllm/transformers_utils/configs/olmo3.py | 12 ++- vllm/transformers_utils/configs/qwen3_next.py | 17 +-- 
vllm/transformers_utils/configs/step3_vl.py | 12 ++- 104 files changed, 542 insertions(+), 910 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index e62cd60efaec0..d4b6f4077ab32 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -872,12 +872,12 @@ steps: optional: true commands: - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' + - pytest -v -s tests/models/test_initialization.py -k 'not (Ultravox or Phi4Multimodal or MiniCPMO or Lfm2Moe or RobertaForSequenceClassification or Ovis2_5 or DeepseekOCR or KimiVL)' - pytest -v -s tests/models/test_transformers.py # - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' + - pytest -v -s tests/models/multimodal/test_mapping.py - python3 examples/offline_inference/basic/chat.py - # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index cb848d2bf579e..83bd91917508f 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -6,7 +6,7 @@ # # The CSV file (named with current date/time) contains these columns: # model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, 
max_position, -# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99, +# is_neox_style, rope_parameters, dtype, torch_mean, torch_median, torch_p99, # torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max, # speedup # @@ -86,9 +86,8 @@ def benchmark_mrope( num_heads: int, num_kv_heads: int, max_position: int = 8192, - rope_theta: float = 10000, is_neox_style: bool = True, - rope_scaling: dict[str, Any] = None, + rope_parameters: dict[str, Any] | None = None, dtype: torch.dtype = torch.bfloat16, seed: int = 0, warmup_iter: int = 10, @@ -102,9 +101,8 @@ def benchmark_mrope( head_size=head_dim, rotary_dim=head_dim, max_position=max_position, - base=rope_theta, is_neox_style=is_neox_style, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dtype=dtype, ).to(device=device) @@ -203,9 +201,8 @@ def benchmark_mrope( num_kv_heads, head_dim, max_position, - rope_theta, is_neox_style, - str(rope_scaling), + str(rope_parameters), str(dtype).split(".")[-1], torch_stats["mean"], torch_stats["median"], @@ -255,9 +252,8 @@ if __name__ == "__main__": "num_kv_heads", "head_dim", "max_position", - "rope_theta", "is_neox_style", - "rope_scaling", + "rope_parameters", "dtype", "torch_mean", "torch_median", @@ -303,7 +299,7 @@ if __name__ == "__main__": q_size = num_heads * head_dim kv_size = num_kv_heads * head_dim is_neox_style = True - rope_theta = config.rope_theta + rope_parameters = config.rope_parameters max_position = config.max_position_embeddings for num_tokens in num_tokens_list: @@ -315,9 +311,8 @@ if __name__ == "__main__": num_heads=num_heads, num_kv_heads=num_kv_heads, max_position=max_position, - rope_theta=rope_theta, is_neox_style=is_neox_style, - rope_scaling=config.rope_scaling, + rope_parameters=rope_parameters, dtype=getattr(torch, args.dtype), seed=args.seed, warmup_iter=args.warmup_iter, diff --git a/examples/offline_inference/context_extension.py 
b/examples/offline_inference/context_extension.py index df39e4c25d5c8..67d33e1881ee9 100644 --- a/examples/offline_inference/context_extension.py +++ b/examples/offline_inference/context_extension.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This script demonstrates how to extend the context length -of a Qwen model using the YARN method (rope_scaling) +of a Qwen model using the YARN method (rope_parameters) and run a simple chat example. Usage: @@ -19,8 +19,8 @@ def create_llm(): # Use yarn to extend context hf_overrides = { - "rope_theta": rope_theta, - "rope_scaling": { + "rope_parameters": { + "rope_theta": rope_theta, "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index 11ae96e930da7..515e0a93ac2a8 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -137,7 +137,7 @@ class TestRotaryEmbedding(torch.nn.Module): self.head_dim, rotary_dim=self.rotary_dim, max_position=max_position, - base=base, + rope_parameters={"rope_type": "default", "rope_theta": base}, ) def forward(self, positions, q, k): @@ -172,7 +172,7 @@ class TestRotaryEmbeddingSliceScatter(torch.nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=base, + rope_parameters={"rope_type": "default", "rope_theta": base}, ) def forward(self, positions, hidden_states): diff --git a/tests/kernels/core/test_mrope.py b/tests/kernels/core/test_mrope.py index 02b795721f46e..43b242ab2d586 100644 --- a/tests/kernels/core/test_mrope.py +++ b/tests/kernels/core/test_mrope.py @@ -5,11 +5,11 @@ from typing import NamedTuple import pytest import torch from packaging.version import Version -from transformers import AutoConfig from transformers import __version__ as TRANSFORMERS_VERSION from vllm.model_executor.layers.rotary_embedding import 
get_rope from vllm.platforms import current_platform +from vllm.transformers_utils.config import get_config device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -98,8 +98,7 @@ def test_mrope( atol = model_info.atol rtol = model_info.rtol - config = AutoConfig.from_pretrained(model_name) - config = config.get_text_config() + config = get_config(model_name, False).get_text_config() # get the model config total_num_kv_heads = config.num_key_value_heads @@ -113,7 +112,6 @@ def test_mrope( ) is_neox_style = True - rope_theta = config.rope_theta max_position = config.max_position_embeddings partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) rotary_dim = int(head_dim * partial_rotary_factor) @@ -122,9 +120,8 @@ def test_mrope( head_size=head_dim, rotary_dim=rotary_dim, max_position=max_position, - base=rope_theta, is_neox_style=is_neox_style, - rope_scaling=config.rope_scaling, + rope_parameters=config.rope_parameters, dtype=dtype, ).to(device=device) @@ -173,8 +170,7 @@ def test_mrope_torch_compile_tracing( atol = model_info.atol rtol = model_info.rtol - config = AutoConfig.from_pretrained(model_name) - config = config.get_text_config() + config = get_config(model_name, False).get_text_config() # get the model config total_num_kv_heads = config.num_key_value_heads @@ -187,7 +183,6 @@ def test_mrope_torch_compile_tracing( else config.hidden_size // total_num_heads ) is_neox_style = True - rope_theta = config.rope_theta max_position = config.max_position_embeddings partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) rotary_dim = int(head_dim * partial_rotary_factor) @@ -196,9 +191,8 @@ def test_mrope_torch_compile_tracing( head_size=head_dim, rotary_dim=rotary_dim, max_position=max_position, - base=rope_theta, is_neox_style=is_neox_style, - rope_scaling=config.rope_scaling, + rope_parameters=config.rope_parameters, dtype=dtype, ).to(device=device) diff --git a/tests/kernels/core/test_pos_encoding.py 
b/tests/kernels/core/test_pos_encoding.py index c35ee5016ba05..a8ed3825689d3 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -74,7 +74,7 @@ def test_rotary_embedding( device: str, use_key: bool, max_position: int = 8192, - base: float = 10000, + rope_theta: float = 10000, ) -> None: if rotary_dim is None: rotary_dim = head_size @@ -83,7 +83,8 @@ def test_rotary_embedding( torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size - rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style) + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + rope = get_rope(head_size, rotary_dim, max_position, is_neox_style, rope_parameters) rope = rope.to(dtype=dtype, device=torch.get_default_device()) positions = torch.randint(0, max_position, (batch_size, seq_len)) @@ -120,9 +121,9 @@ def test_rotary_embedding( @torch.inference_mode() def test_rope_module_cache(): MAX_POSITIONS = [123, 1234] - BASES = [10000, 1000000] - ROPE_SCALINGS = ( - None, + ROPE_THETAS = [10000, 1000000] + ROPE_PARAMETERS = ( + {"rope_type": "default"}, {"rope_type": "linear", "factor": (1,)}, {"rope_type": "dynamic", "factor": 1}, ) @@ -130,9 +131,9 @@ def test_rope_module_cache(): HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, - BASES, + ROPE_THETAS, IS_NEOX_STYLE, - ROPE_SCALINGS, + ROPE_PARAMETERS, DTYPES, ) rope_setting_id_map: dict[str, int] = {} @@ -141,20 +142,20 @@ def test_rope_module_cache(): head_size, rotary_dim, max_position, - base, - is_neox_stype, - rope_scaling, + rope_theta, + is_neox_style, + rope_parameters, dtype, ) = setting if rotary_dim is None: rotary_dim = head_size + rope_parameters["rope_theta"] = rope_theta rope = get_rope( head_size, rotary_dim, max_position, - base, - is_neox_stype, - rope_scaling, + is_neox_style, + rope_parameters, dtype, ) # different settings cannot share the same rope module @@ -168,20 +169,20 @@ def test_rope_module_cache(): head_size, rotary_dim, 
max_position, - base, - is_neox_stype, - rope_scaling, + rope_theta, + is_neox_style, + rope_parameters, dtype, ) = setting if rotary_dim is None: rotary_dim = head_size + rope_parameters["rope_theta"] = rope_theta rope = get_rope( head_size, rotary_dim, max_position, - base, - is_neox_stype, - rope_scaling, + is_neox_style, + rope_parameters, dtype, ) # check if cache take effect diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py index dfd317bcf72f1..af33fd4e3fc3b 100644 --- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py +++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py @@ -201,7 +201,7 @@ class ModelConfig: sliding_window: int = 128 initial_context_length: int = 4096 rope_theta: float = 150000.0 - rope_scaling_factor: float = 32.0 + rope_parameters_factor: float = 32.0 rope_ntk_alpha: float = 1.0 rope_ntk_beta: float = 32.0 diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index 88f088c603276..d6216a87a229e 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: SIM117 +from typing import Any + import pytest from ...utils import EmbedModelInfo @@ -79,8 +81,8 @@ def test_set_max_model_len_illegal(model_info, vllm_runner): @pytest.mark.parametrize("model_info", MODELS) def test_use_rope_scaling_legal(model_info, vllm_runner): hf_overrides = { - "rope_theta": rope_theta, - "rope_scaling": { + "rope_parameters": { + "rope_theta": rope_theta, "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, @@ -96,9 +98,9 @@ def test_use_rope_scaling_legal(model_info, vllm_runner): @pytest.mark.parametrize("model_info", MODELS) def 
test_use_rope_scaling_illegal(model_info, vllm_runner): - hf_overrides = { - "rope_theta": rope_theta, - "rope_scaling": { + hf_overrides: dict[str, Any] = { + "rope_parameters": { + "rope_theta": rope_theta, "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, @@ -115,8 +117,8 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner): pass hf_overrides = { - "rope_theta": rope_theta, - "rope_scaling": { + "rope_parameters": { + "rope_theta": rope_theta, "rope_type": "yarn", "factor": factor, "original_max_position_embeddings": original_max_position_embeddings, diff --git a/tests/test_config.py b/tests/test_config.py index bba2fbec3db29..16f68d18fc68b 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -249,45 +249,48 @@ def test_get_bert_tokenization_sentence_transformer_config(): def test_rope_customization(): - TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0} - TEST_ROPE_THETA = 16_000_000.0 - LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0} + TEST_ROPE_PARAMETERS = { + "rope_theta": 16_000_000.0, + "rope_type": "dynamic", + "factor": 2.0, + } + LLAMA_ROPE_PARAMETERS = {"rope_theta": 500000.0, "rope_type": "default"} + LONGCHAT_ROPE_PARAMETERS = {"rope_type": "linear", "factor": 8.0} llama_model_config = ModelConfig("meta-llama/Meta-Llama-3-8B-Instruct") - assert getattr(llama_model_config.hf_config, "rope_scaling", None) is None - assert getattr(llama_model_config.hf_config, "rope_theta", None) == 500_000 + assert ( + getattr(llama_model_config.hf_config, "rope_parameters", None) + == LLAMA_ROPE_PARAMETERS + ) assert llama_model_config.max_model_len == 8192 llama_model_config = ModelConfig( "meta-llama/Meta-Llama-3-8B-Instruct", - hf_overrides={ - "rope_scaling": TEST_ROPE_SCALING, - "rope_theta": TEST_ROPE_THETA, - }, + hf_overrides={"rope_parameters": TEST_ROPE_PARAMETERS}, ) assert ( - getattr(llama_model_config.hf_config, "rope_scaling", None) == 
TEST_ROPE_SCALING + getattr(llama_model_config.hf_config, "rope_parameters", None) + == TEST_ROPE_PARAMETERS ) - assert getattr(llama_model_config.hf_config, "rope_theta", None) == TEST_ROPE_THETA assert llama_model_config.max_model_len == 16384 longchat_model_config = ModelConfig("lmsys/longchat-13b-16k") - # Check if LONGCHAT_ROPE_SCALING entries are in longchat_model_config + # Check if LONGCHAT_ROPE_PARAMETERS entries are in longchat_model_config assert all( - longchat_model_config.hf_config.rope_scaling.get(key) == value - for key, value in LONGCHAT_ROPE_SCALING.items() + longchat_model_config.hf_config.rope_parameters.get(key) == value + for key, value in LONGCHAT_ROPE_PARAMETERS.items() ) assert longchat_model_config.max_model_len == 16384 longchat_model_config = ModelConfig( "lmsys/longchat-13b-16k", hf_overrides={ - "rope_scaling": TEST_ROPE_SCALING, + "rope_parameters": TEST_ROPE_PARAMETERS, }, ) assert ( - getattr(longchat_model_config.hf_config, "rope_scaling", None) - == TEST_ROPE_SCALING + getattr(longchat_model_config.hf_config, "rope_parameters", None) + == TEST_ROPE_PARAMETERS ) assert longchat_model_config.max_model_len == 4096 diff --git a/vllm/config/model.py b/vllm/config/model.py index b563a40eb8fc9..d1e56a72a318b 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -11,6 +11,7 @@ import torch from pydantic import ConfigDict, SkipValidation, field_validator, model_validator from pydantic.dataclasses import dataclass from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE +from transformers.configuration_utils import ALLOWED_LAYER_TYPES import vllm.envs as envs from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig @@ -2100,31 +2101,32 @@ def _get_and_verify_max_len( ) derived_max_model_len = default_max_len - rope_scaling = getattr(hf_config, "rope_scaling", None) + # In Transformers v5 rope_parameters could be TypedDict or dict[str, TypedDict]. 
+ # To simplify the verification, we convert it to dict[str, TypedDict]. + rope_parameters = getattr(hf_config, "rope_parameters", None) + if rope_parameters and not set(rope_parameters.keys()).issubset( + ALLOWED_LAYER_TYPES + ): + rope_parameters = {"": rope_parameters} + # NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE # scaling, so we skip applying the scaling factor again. - if rope_scaling is not None and "gemma3" not in hf_config.model_type: - # No need to consider "type" key because of patch_rope_scaling when - # loading HF config - rope_type = rope_scaling["rope_type"] + if rope_parameters is not None and "gemma3" not in hf_config.model_type: + scaling_factor = 1.0 + for rp in rope_parameters.values(): + # No need to consider "type" key because of patch_rope_parameters when + # loading HF config + rope_type = rp["rope_type"] - if rope_type not in ("su", "longrope", "llama3"): - if disable_sliding_window: - # TODO(robertgshaw): Find a model that supports rope_scaling - # with sliding window to see if this case should be allowed. - raise NotImplementedError( - "Disabling sliding window is not supported for models " - "with rope_scaling. Please raise an issue so we can " - "investigate." - ) + if rope_type not in ("su", "longrope", "llama3"): + # NOTE: rope_type == "default" does not define factor https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py + # NOTE: This assumes all layer types have the same scaling factor. 
+ scaling_factor = rp.get("factor", scaling_factor) - # NOTE: rope_type == "default" does not define factor - # https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/modeling_rope_utils.py - scaling_factor = rope_scaling.get("factor", 1.0) - - if rope_type == "yarn": - derived_max_model_len = rope_scaling["original_max_position_embeddings"] - derived_max_model_len *= scaling_factor + if rope_type == "yarn": + derived_max_model_len = rp["original_max_position_embeddings"] + # Do this outside loop since all layer types should have the same scaling + derived_max_model_len *= scaling_factor if encoder_config and "max_seq_length" in encoder_config: derived_max_model_len = encoder_config["max_seq_length"] @@ -2134,7 +2136,9 @@ def _get_and_verify_max_len( if max_model_len is None: # For LongRoPE, default to original_max_position_embeddings to avoid # performance degradation for shorter sequences - if rope_scaling is not None and rope_scaling["rope_type"] == "longrope": + if rope_parameters is not None and any( + rp["rope_type"] == "longrope" for rp in rope_parameters.values() + ): max_model_len = int( getattr( hf_config, "original_max_position_embeddings", derived_max_model_len @@ -2151,16 +2155,7 @@ def _get_and_verify_max_len( # that will be bigger than derived_max_model_len. We compare user input # with model_max_length and allow this override when it's smaller. model_max_length = getattr(hf_config, "model_max_length", None) - if model_max_length is not None and max_model_len <= model_max_length: - if disable_sliding_window: - # TODO(robertgshaw): Find a model that has model_max_length - # with sliding window to see if this case should be allowed. - raise NotImplementedError( - "Disabling sliding window is not supported for models " - "model_max_length in the config. Please raise an issue " - "so we can investigate." 
- ) - else: + if model_max_length is None or max_model_len > model_max_length: msg = ( f"User-specified max_model_len ({max_model_len}) is greater " f"than the derived max_model_len ({max_len_key}=" diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py index 56c165f9c041a..ae8a7d93b50e4 100644 --- a/vllm/model_executor/layers/rotary_embedding/__init__.py +++ b/vllm/model_executor/layers/rotary_embedding/__init__.py @@ -26,23 +26,23 @@ def get_rope( head_size: int, rotary_dim: int, max_position: int, - base: float, is_neox_style: bool = True, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, dtype: torch.dtype | None = None, partial_rotary_factor: float = 1.0, dual_chunk_attention_config: dict[str, Any] | None = None, ) -> RotaryEmbedding: if dtype is None: dtype = torch.get_default_dtype() - if rope_scaling is not None: + if rope_parameters is not None: # Transforms every value that is a list into a tuple for caching calls - rope_scaling_tuple = { - k: tuple(v) if isinstance(v, list) else v for k, v in rope_scaling.items() + rope_parameters_tuple = { + k: tuple(v) if isinstance(v, list) else v + for k, v in rope_parameters.items() } - rope_scaling_args = tuple(rope_scaling_tuple.items()) + rope_parameters_args = tuple(rope_parameters_tuple.items()) else: - rope_scaling_args = None + rope_parameters_args = None if dual_chunk_attention_config is not None: dual_chunk_attention_tuple = { @@ -60,15 +60,15 @@ def get_rope( head_size, rotary_dim, max_position, - base, is_neox_style, - rope_scaling_args, + rope_parameters_args, dual_chunk_attention_args, dtype, ) if key in _ROPE_DICT: return _ROPE_DICT[key] + base = rope_parameters["rope_theta"] if rope_parameters else 10000 if dual_chunk_attention_config is not None: extra_kwargs = { k: v @@ -84,18 +84,18 @@ def get_rope( dtype, **extra_kwargs, ) - elif not rope_scaling: + elif not rope_parameters: 
rotary_emb = RotaryEmbedding( head_size, rotary_dim, max_position, base, is_neox_style, dtype ) else: - scaling_type = rope_scaling["rope_type"] + scaling_type = rope_parameters["rope_type"] if scaling_type == "llama3": - scaling_factor = rope_scaling["factor"] - low_freq_factor = rope_scaling["low_freq_factor"] - high_freq_factor = rope_scaling["high_freq_factor"] - original_max_position = rope_scaling["original_max_position_embeddings"] + scaling_factor = rope_parameters["factor"] + low_freq_factor = rope_parameters["low_freq_factor"] + high_freq_factor = rope_parameters["high_freq_factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] rotary_emb = Llama3RotaryEmbedding( head_size, rotary_dim, @@ -113,7 +113,7 @@ def get_rope( head_size, rotary_dim, max_position, base, is_neox_style, dtype ) elif scaling_type == "default": - if "mrope_section" in rope_scaling: + if "mrope_section" in rope_parameters: rotary_emb = MRotaryEmbedding( head_size, rotary_dim, @@ -121,8 +121,8 @@ def get_rope( base, is_neox_style, dtype, - mrope_section=rope_scaling["mrope_section"], - mrope_interleaved=rope_scaling.get("mrope_interleaved", False), + mrope_section=rope_parameters["mrope_section"], + mrope_interleaved=rope_parameters.get("mrope_interleaved", False), ) else: rotary_emb = RotaryEmbedding( @@ -134,7 +134,7 @@ def get_rope( dtype, ) elif scaling_type == "linear": - scaling_factor = rope_scaling["factor"] + scaling_factor = rope_parameters["factor"] rotary_emb = LinearScalingRotaryEmbedding( head_size, rotary_dim, @@ -145,8 +145,8 @@ def get_rope( dtype, ) elif scaling_type == "ntk": - scaling_factor = rope_scaling["factor"] - mixed_b = rope_scaling.get("mixed_b", None) + scaling_factor = rope_parameters["factor"] + mixed_b = rope_parameters.get("mixed_b") rotary_emb = NTKScalingRotaryEmbedding( head_size, rotary_dim, @@ -158,8 +158,8 @@ def get_rope( mixed_b, ) elif scaling_type == "dynamic": - if "alpha" in rope_scaling: - scaling_alpha = 
rope_scaling["alpha"] + if "alpha" in rope_parameters: + scaling_alpha = rope_parameters["alpha"] rotary_emb = DynamicNTKAlphaRotaryEmbedding( head_size, rotary_dim, @@ -169,8 +169,8 @@ def get_rope( scaling_alpha, dtype, ) - elif "factor" in rope_scaling: - scaling_factor = rope_scaling["factor"] + elif "factor" in rope_parameters: + scaling_factor = rope_parameters["factor"] rotary_emb = DynamicNTKScalingRotaryEmbedding( head_size, rotary_dim, @@ -185,11 +185,11 @@ def get_rope( "Dynamic rope scaling must contain either 'alpha' or 'factor' field" ) elif scaling_type == "yarn": - scaling_factor = rope_scaling["factor"] - original_max_position = rope_scaling["original_max_position_embeddings"] + scaling_factor = rope_parameters["factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] extra_kwargs = { k: v - for k, v in rope_scaling.items() + for k, v in rope_parameters.items() if k in ( "extrapolation_factor", @@ -199,7 +199,7 @@ def get_rope( "apply_yarn_scaling", ) } - if "mrope_section" in rope_scaling: + if "mrope_section" in rope_parameters: extra_kwargs.pop("apply_yarn_scaling", None) rotary_emb = MRotaryEmbedding( head_size, @@ -208,8 +208,8 @@ def get_rope( base, is_neox_style, dtype, - mrope_section=rope_scaling["mrope_section"], - mrope_interleaved=rope_scaling.get("mrope_interleaved", False), + mrope_section=rope_parameters["mrope_section"], + mrope_interleaved=rope_parameters.get("mrope_interleaved", False), scaling_factor=scaling_factor, **extra_kwargs, ) @@ -225,12 +225,12 @@ def get_rope( **extra_kwargs, ) elif scaling_type == "deepseek_yarn": - scaling_factor = rope_scaling["factor"] - original_max_position = rope_scaling["original_max_position_embeddings"] + scaling_factor = rope_parameters["factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] # assert max_position == original_max_position * scaling_factor extra_kwargs = { k: v - for k, v in rope_scaling.items() + for k, v in 
rope_parameters.items() if k in ( "extrapolation_factor", @@ -252,12 +252,12 @@ def get_rope( **extra_kwargs, ) elif scaling_type == "longrope": - short_factor = rope_scaling["short_factor"] - long_factor = rope_scaling["long_factor"] - original_max_position = rope_scaling["original_max_position_embeddings"] + short_factor = rope_parameters["short_factor"] + long_factor = rope_parameters["long_factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] extra_kwargs = { k: v - for k, v in rope_scaling.items() + for k, v in rope_parameters.items() if k in ("short_mscale", "long_mscale") } rotary_emb = Phi3LongRoPEScaledRotaryEmbedding( diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py index 6f654f47495f7..4eb5665a71fc8 100644 --- a/vllm/model_executor/models/afmoe.py +++ b/vllm/model_executor/models/afmoe.py @@ -5,7 +5,6 @@ import typing from collections.abc import Callable, Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -171,8 +170,6 @@ class AfmoeAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 131072, head_dim: int | None = None, rms_norm_eps: float = 1e-05, @@ -202,7 +199,6 @@ class AfmoeAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings # Check if this is a local attention layer @@ -246,8 +242,7 @@ class AfmoeAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config["rope_parameters"], is_neox_style=True, ) else: @@ -303,14 +298,6 @@ class AfmoeDecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = 
config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) # DecoderLayers are created with `make_layers` which passes the prefix @@ -323,8 +310,6 @@ class AfmoeDecoderLayer(nn.Module): hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, head_dim=config.head_dim, rms_norm_eps=config.rms_norm_eps, diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index 0a8f21abb0a35..b75e91319bbad 100644 --- a/vllm/model_executor/models/apertus.py +++ b/vllm/model_executor/models/apertus.py @@ -27,7 +27,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -118,8 +117,6 @@ class ApertusAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -155,7 +152,6 @@ class ApertusAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -176,9 +172,7 @@ class ApertusAttention(nn.Module): prefix=f"{prefix}.o_proj", ) - self._init_rotary_emb( - config, rope_scaling=rope_scaling, quant_config=quant_config - ) + self._init_rotary_emb(config, quant_config=quant_config) 
sliding_window = None if layer_types := getattr(config, "layer_types", None): @@ -224,7 +218,6 @@ class ApertusAttention(nn.Module): def _init_rotary_emb( self, config: ApertusConfig, - rope_scaling: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True @@ -236,8 +229,7 @@ class ApertusAttention(nn.Module): self.head_dim, rotary_dim=int(self.partial_rotary_factor * self.head_dim), max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, ) @@ -253,14 +245,6 @@ class ApertusDecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -288,8 +272,6 @@ class ApertusDecoderLayer(nn.Module): num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py index 20c3ff0754506..b3887b16f4d74 100644 --- a/vllm/model_executor/models/arcee.py +++ b/vllm/model_executor/models/arcee.py @@ -103,15 +103,6 @@ class ArceeDecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Rotary embedding parameters (reuse LLaMA defaults) - rope_theta = getattr(config, "rope_theta", 10000) - 
rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Determine if attention bias is needed (some variants use bias terms) attention_bias = getattr(config, "attention_bias", False) or getattr( @@ -133,8 +124,6 @@ class ArceeDecoderLayer(nn.Module): num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index b5cc07a56535d..b75a254761d4e 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -292,7 +292,6 @@ class ArcticAttention(nn.Module): self.kv_size = self.num_kv_heads * self.head_dim self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta self.scaling = self.head_dim**-0.5 self.qkv_proj = QKVParallelLinear( @@ -317,7 +316,7 @@ class ArcticAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=int(self.rope_theta), + rope_parameters=config.rope_parameters, is_neox_style=True, ) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 8991ef4c606b6..edf47270e5277 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -136,7 +136,7 @@ class BaiChuanAttention(nn.Module): hidden_size: int, num_heads: int, position_embedding: str, - rope_theta: float = 10000, + rope_parameters: dict, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, 
@@ -150,7 +150,6 @@ class BaiChuanAttention(nn.Module): self.num_heads = self.total_num_heads // tensor_model_parallel_world_size self.head_dim = hidden_size // self.total_num_heads self.position_embedding = position_embedding - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings # pylint: disable=invalid-name @@ -192,7 +191,7 @@ class BaiChuanAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, + rope_parameters=rope_parameters, ) self.scaling = self.head_dim**-0.5 self.attn = Attention( @@ -229,13 +228,12 @@ class BaiChuanDecoderLayer(nn.Module): ): super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = BaiChuanAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, position_embedding=position_embedding, - rope_theta=rope_theta, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index 024425bb24406..cc10e936a2d3d 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -135,9 +135,8 @@ class BailingAttention(nn.Module): self.head_dim, rotary_dim=self.rotary_dim, max_position=config.max_position_embeddings, - base=config.rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=True, - rope_scaling=config.rope_scaling, partial_rotary_factor=self.partial_rotary_factor, ) diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index c6cc83487fec2..4422bb5da98f4 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -156,8 +156,6 @@ class BambaAttentionDecoderLayer(nn.Module): 
prefix: str = "", ) -> None: super().__init__() - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.hidden_size = config.hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -178,7 +176,6 @@ class BambaAttentionDecoderLayer(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings if hasattr(config, "partial_rotary_factor"): @@ -192,8 +189,7 @@ class BambaAttentionDecoderLayer(nn.Module): head_size=self.head_dim, rotary_dim=rotary_dim, max_position=max_position_embeddings, - rope_scaling=rope_scaling, - base=rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=True, dtype=torch.get_default_dtype(), # see impl of get_rope ) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 3c87bbfefab3d..b5a6d00dc309f 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -265,8 +265,7 @@ class ChameleonAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 4096, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -293,7 +292,6 @@ class ChameleonAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -318,8 +316,7 @@ class ChameleonAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - 
rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( @@ -369,14 +366,6 @@ class ChameleonDecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 4096) self.self_attn = ChameleonAttention( @@ -385,8 +374,7 @@ class ChameleonDecoderLayer(nn.Module): num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, @@ -439,14 +427,6 @@ class ChameleonSwinDecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 4096) self.self_attn = ChameleonAttention( @@ -455,8 +435,7 @@ class ChameleonSwinDecoderLayer(nn.Module): num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 5d6f5e9125a28..dbfcd62d0bcab 100644 --- 
a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -99,6 +99,7 @@ class GLMAttention(nn.Module): # https://huggingface.co/zai-org/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141 rope_ratio = getattr(config, "rope_ratio", 1.0) max_positions = getattr(config, "seq_length", 8192) + rope_parameters = {"rope_type": "default", "rope_theta": 10000 * rope_ratio} # NOTE: zai-org/cogagent-9b-20241220 uses original_rope=False, # which is equivalent to is_neox_style=True is_neox_style = not config.original_rope @@ -106,7 +107,7 @@ class GLMAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim // 2, max_position=max_positions, - base=10000 * rope_ratio, + rope_parameters=rope_parameters, is_neox_style=is_neox_style, ) self.attn = Attention( diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 77bb178519813..5ed920927c772 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -156,8 +156,6 @@ class CohereAttention(nn.Module): self.max_position_embeddings = getattr( config, "model_max_length", None ) or getattr(config, "max_position_embeddings", 8192) - self.rope_theta = config.rope_theta - self.rope_scaling = getattr(config, "rope_scaling", None) self.use_qk_norm = getattr(config, "use_qk_norm", False) self.qkv_proj = QKVParallelLinear( self.hidden_size, @@ -179,8 +177,7 @@ class CohereAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=False, ) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 66b246878b0aa..3cf4bf991e667 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -8,6 +8,7 @@ import vllm.envs as envs from vllm.logger import init_logger from 
vllm.model_executor.models import ModelRegistry from vllm.platforms import current_platform +from vllm.transformers_utils.config import set_default_rope_theta from vllm.utils.math_utils import cdiv, round_up from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec @@ -46,8 +47,7 @@ class GteNewModelConfig(VerifyAndUpdateConfig): "head_size": head_dim, "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": config.max_position_embeddings, - "base": config.rope_theta, - "rope_scaling": getattr(config, "rope_scaling", None), + "rope_parameters": config.rope_parameters, } @@ -78,12 +78,13 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig): if not model_config.enforce_eager: max_position = round_up(max_position, 8) + set_default_rope_theta(config, default_theta=config.rotary_emb_base) + config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": max_position, - "base": getattr(config, "rope_theta", config.rotary_emb_base), - "rope_scaling": getattr(config, "rope_scaling", None), + "rope_parameters": config.rope_parameters, } @@ -117,18 +118,20 @@ class NomicBertModelConfig(VerifyAndUpdateConfig): head_dim = config.hidden_size // config.num_attention_heads rotary_emb_dim = int(head_dim * config.rotary_emb_fraction) max_trained_positions = getattr(config, "max_trained_positions", 2048) + + set_default_rope_theta(config, default_theta=config.rotary_emb_base) + config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": rotary_emb_dim, "max_position": max_trained_positions, - "base": getattr(config, "rope_theta", config.rotary_emb_base), - "rope_scaling": getattr(config, "rope_scaling", None), + "rope_parameters": config.rope_parameters, } # we ignore config.rotary_scaling_factor so that for datasets shorter # than max_trained_positions 2048, the results are consistent # with SentenceTransformer. 
- # The context extension uses vllm style rope_theta and rope_scaling. + # The context extension uses vllm style rope_theta and rope_parameters. # See #17785 #18755 if ( not vllm_config.model_config.hf_overrides @@ -172,7 +175,7 @@ class NomicBertModelConfig(VerifyAndUpdateConfig): if hasattr(hf_text_config, "max_model_len"): delattr(hf_text_config, "max_model_len") hf_text_config.max_position_embeddings = max_trained_positions - hf_text_config.rope_scaling = config.rotary_kwargs["rope_scaling"] + hf_text_config.rope_parameters = config.rotary_kwargs["rope_parameters"] # The priority of sentence_bert_config.json is higher # than max_position_embeddings @@ -246,8 +249,7 @@ class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig): "head_size": head_dim, "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": config.max_position_embeddings, - "base": config.rope_theta, - "rope_scaling": getattr(config, "rope_scaling", None), + "rope_parameters": config.rope_parameters, } diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 528ef4f76742d..2c729019081a4 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -197,7 +197,10 @@ class DbrxAttention(nn.Module): self.head_dim = self.d_model // self.total_num_heads self.total_num_kv_heads = config.attn_config.kv_n_heads self.clip_qkv = config.attn_config.clip_qkv - self.rope_theta = config.attn_config.rope_theta + rope_parameters = { + "rope_type": "default", + "rope_theta": int(config.attn_config.rope_theta), + } self.max_position = config.max_seq_len # pylint: disable=invalid-name @@ -221,7 +224,7 @@ class DbrxAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position, - base=int(self.rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, ) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index e8ee9951d6119..6675b2133f386 100644 --- 
a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -27,7 +27,6 @@ import typing from collections.abc import Callable, Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -111,8 +110,6 @@ class DeepseekAttention(nn.Module): config: DeepseekV2Config | DeepseekV3Config, hidden_size: int, num_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -139,7 +136,6 @@ class DeepseekAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -162,8 +158,7 @@ class DeepseekAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -409,8 +404,6 @@ class DeepseekV2Attention(nn.Module): v_head_dim: int, q_lora_rank: int, kv_lora_rank: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -430,7 +423,6 @@ class DeepseekV2Attention(nn.Module): assert num_heads % tp_size == 0 self.num_local_heads = num_heads // tp_size self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings assert topk_indices_buffer is None, ( "topk_indices_buffer is not \ @@ -485,21 +477,20 @@ class DeepseekV2Attention(nn.Module): quant_config=quant_config, prefix=f"{prefix}.o_proj", ) - if rope_scaling: - rope_scaling["rope_type"] = "deepseek_yarn" + if 
config.rope_parameters["rope_type"] != "default": + config.rope_parameters["rope_type"] = "deepseek_yarn" self.rotary_emb = get_rope( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=False, ) - if rope_scaling: - mscale_all_dim = rope_scaling.get("mscale_all_dim", False) - scaling_factor = rope_scaling["factor"] + if config.rope_parameters["rope_type"] != "default": + mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False) + scaling_factor = config.rope_parameters["factor"] mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.scaling = self.scaling * mscale * mscale @@ -903,8 +894,6 @@ class DeepseekV2MLAAttention(nn.Module): v_head_dim: int, q_lora_rank: int | None, kv_lora_rank: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -927,7 +916,6 @@ class DeepseekV2MLAAttention(nn.Module): self.num_local_heads = num_heads // tp_size self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings if self.q_lora_rank is not None: @@ -981,19 +969,18 @@ class DeepseekV2MLAAttention(nn.Module): prefix=f"{prefix}.o_proj", ) - if rope_scaling: - rope_scaling["rope_type"] = "deepseek_yarn" + if config.rope_parameters["rope_type"] != "default": + config.rope_parameters["rope_type"] = "deepseek_yarn" self.rotary_emb = get_rope( qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=False, ) - if rope_scaling: - mscale_all_dim = rope_scaling.get("mscale_all_dim", False) - scaling_factor = rope_scaling["factor"] + if config.rope_parameters["rope_type"] != "default": + 
mscale_all_dim = config.rope_parameters.get("mscale_all_dim", False) + scaling_factor = config.rope_parameters["factor"] mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.scaling = self.scaling * mscale * mscale @@ -1073,8 +1060,6 @@ class DeepseekV2DecoderLayer(nn.Module): parallel_config = vllm_config.parallel_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) moe_layer_freq = getattr(config, "moe_layer_freq", 1) # DecoderLayers are created with `make_layers` which passes the prefix @@ -1107,8 +1092,6 @@ class DeepseekV2DecoderLayer(nn.Module): v_head_dim=v_head_dim, q_lora_rank=config.q_lora_rank if hasattr(config, "q_lora_rank") else None, kv_lora_rank=kv_lora_rank, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index d24da0c42a254..e65c275106a4e 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -27,7 +27,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -202,8 +201,6 @@ class Dots1Attention(nn.Module): num_heads: int, num_kv_heads: int, config: Dots1Config, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -229,7 +226,6 @@ class Dots1Attention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings attention_bias = 
config.attention_bias @@ -255,8 +251,7 @@ class Dots1Attention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -296,8 +291,6 @@ class Dots1DecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) layer_idx = int(prefix.split(sep=".")[-1]) self.layer_idx = layer_idx @@ -307,8 +300,6 @@ class Dots1DecoderLayer(nn.Module): num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, config=config, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index f2999968669f6..a7df3509e3ecd 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -62,6 +62,7 @@ from vllm.model_executor.model_loader.weight_utils import ( maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP from .utils import ( @@ -232,9 +233,8 @@ class Ernie4_5_MoeAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict[str, Any], head_dim: int | None = None, - rope_theta: float = 500000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 131072, rms_norm_eps: float = 1e-05, qkv_bias: bool = False, @@ -266,7 +266,6 @@ class Ernie4_5_MoeAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * 
self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -291,9 +290,8 @@ class Ernie4_5_MoeAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=rope_parameters, is_neox_style=False, - rope_scaling=rope_scaling, ) self.attn = Attention( self.num_heads, @@ -333,16 +331,14 @@ class Ernie4_5_MoeDecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 500000) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=500000) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) self.self_attn = Ernie4_5_MoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, head_dim=getattr(config, "head_dim", None), - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "use_bias", False), diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py index e8ef86f9b7f01..50e033d77606d 100644 --- a/vllm/model_executor/models/ernie45_vl_moe.py +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -58,6 +58,7 @@ from vllm.model_executor.model_loader.weight_utils import ( maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .ernie45_moe import Ernie4_5_MoeMLP from .interfaces import SupportsPP @@ -91,9 +92,8 @@ class Ernie4_5_VLMoeAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict[str, Any], head_dim: int | None = None, - rope_theta: float = 500000, - 
rope_scaling: dict[str, Any] | None = None, freq_allocation: int = 20, max_position_embeddings: int = 131072, rms_norm_eps: float = 1e-05, @@ -126,7 +126,6 @@ class Ernie4_5_VLMoeAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -155,7 +154,7 @@ class Ernie4_5_VLMoeAttention(nn.Module): head_size=self.head_dim, rotary_dim=self.head_dim, max_position_embeddings=max_position_embeddings, - base=rope_theta, + base=rope_parameters["rope_theta"], is_neox_style=False, dtype=torch.get_default_dtype(), mrope_section=[h_rope, w_rope, t_rope], @@ -413,8 +412,7 @@ class Ernie4_5_VLMoeDecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 500000) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=500000) freq_allocation = getattr(config, "freq_allocation", 20) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) @@ -423,8 +421,7 @@ class Ernie4_5_VLMoeDecoderLayer(nn.Module): num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, head_dim=getattr(config, "head_dim", None), - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, freq_allocation=freq_allocation, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 6c56bfc433c7a..d13275488fe99 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -27,7 +27,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -113,8 +112,6 @@ class 
ExaoneAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -144,7 +141,6 @@ class ExaoneAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -173,8 +169,7 @@ class ExaoneAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, ) self.attn = Attention( @@ -207,8 +202,6 @@ class ExaoneBlockAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -221,8 +214,6 @@ class ExaoneBlockAttention(nn.Module): hidden_size=hidden_size, num_heads=num_heads, num_kv_heads=num_kv_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=bias, @@ -251,14 +242,6 @@ class ExaoneDecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b 
with bias @@ -272,8 +255,6 @@ class ExaoneDecoderLayer(nn.Module): num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index b89e168ada20e..70f3cce2b7c56 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -23,7 +23,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -52,6 +51,7 @@ from vllm.model_executor.model_loader.weight_utils import ( maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import SupportsLoRA, SupportsPP from .utils import ( @@ -110,8 +110,6 @@ class Exaone4Attention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 1000000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -141,7 +139,6 @@ class Exaone4Attention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -176,12 +173,12 @@ class Exaone4Attention(nn.Module): # apply rotary embeddings to every layer in full attention models self.apply_rope_all_layers = "sliding_attention" not in config.layer_types + set_default_rope_theta(config, default_theta=1000000) self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, 
is_neox_style=is_neox_style, ) self.attn = Attention( @@ -227,14 +224,6 @@ class Exaone4DecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -249,8 +238,6 @@ class Exaone4DecoderLayer(nn.Module): num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 85acdff3d96b4..dc2d51f340c8c 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -164,13 +164,12 @@ class FalconAttention(nn.Module): ) if self.use_rotary: - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index b985847af5daf..9433f0d1b4a49 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -35,6 +35,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence 
import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import ( HasInnerState, @@ -214,8 +215,7 @@ class FalconH1AttentionDecoderLayer(nn.Module): prefix: str = "", ) -> None: super().__init__() - rope_theta = getattr(config, "rope_theta", 1e11) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=1e11) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.hidden_size = config.hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -240,7 +240,6 @@ class FalconH1AttentionDecoderLayer(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings if hasattr(config, "partial_rotary_factor"): @@ -254,8 +253,7 @@ class FalconH1AttentionDecoderLayer(nn.Module): head_size=self.head_dim, rotary_dim=rotary_dim, max_position=max_position_embeddings, - rope_scaling=rope_scaling, - base=rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=True, dtype=None, # see impl of get_rope ) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 7aaae7c503b58..00c7f59a08094 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -20,6 +20,7 @@ from collections.abc import Iterable from functools import cache from itertools import islice +from typing import Any import torch from torch import nn @@ -127,8 +128,8 @@ class GemmaAttention(nn.Module): num_heads: int, num_kv_heads: int, head_dim: int, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -153,7 +154,6 @@ class GemmaAttention(nn.Module): self.q_size = self.num_heads * self.head_dim 
self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -176,7 +176,7 @@ class GemmaAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=self.rope_theta, + rope_parameters=rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -218,7 +218,7 @@ class GemmaDecoderLayer(nn.Module): num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, max_position_embeddings=config.max_position_embeddings, - rope_theta=config.rope_theta, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 4d5d6cbb37c62..9b6cfe6932300 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -107,7 +107,6 @@ class Gemma2Attention(nn.Module): num_kv_heads: int, head_dim: int, max_position_embeddings: int, - rope_theta: float, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, attn_logits_soft_cap: float | None = None, @@ -134,7 +133,6 @@ class Gemma2Attention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = config.query_pre_attn_scalar**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -156,7 +154,7 @@ class Gemma2Attention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=self.rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=True, ) @@ -206,7 +204,6 @@ class Gemma2DecoderLayer(nn.Module): num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, max_position_embeddings=config.max_position_embeddings, - rope_theta=config.rope_theta, cache_config=cache_config, quant_config=quant_config, 
attn_logits_soft_cap=config.attn_logit_softcapping, diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 357e61a4e78bf..565719ae7faeb 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -155,25 +155,28 @@ class Gemma3Attention(nn.Module): self.k_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps) layer_idx = extract_layer_index(prefix) - self.is_sliding = config.layer_types[layer_idx] == "sliding_attention" + layer_type = config.layer_types[layer_idx] + self.is_sliding = layer_type == "sliding_attention" sliding_window = config.sliding_window if self.is_sliding else None # Initialize the rotary embedding. - if self.is_sliding: - # Local attention. Override the values in config.json. - self.rope_theta = config.rope_local_base_freq - self.rope_scaling = {"rope_type": "default"} + if layer_type in config.rope_parameters: + # Transformers v5 rope config. + rope_parameters = config.rope_parameters[layer_type] else: + # Transformers v4 rope config. # Global attention. Use the values in config.json. - self.rope_theta = config.rope_theta - self.rope_scaling = config.rope_scaling + rope_parameters = config.rope_parameters.copy() + # Local attention. Override the values in config.json. 
+ if self.is_sliding: + rope_parameters["rope_theta"] = config.rope_local_base_freq + self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=self.rope_theta, + rope_parameters=rope_parameters, is_neox_style=True, - rope_scaling=self.rope_scaling, ) if getattr(config, "is_causal", True): diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index 64443190f53ed..8f1447ba34a81 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -332,18 +332,21 @@ class Gemma3nAttention(nn.Module): ) layer_idx = extract_layer_index(prefix) - is_sliding = config.layer_types[layer_idx] == "sliding_attention" + layer_type = config.layer_types[layer_idx] + is_sliding = layer_type == "sliding_attention" self.sliding_window = config.sliding_window if is_sliding else None # Initialize the rotary embedding. - if is_sliding: - # Local attention. Override the values in config.json. - rope_theta = config.rope_local_base_freq - rope_scaling = {"rope_type": "default"} + if layer_type in config.rope_parameters: + # Transformers v5 rope config. + rope_parameters = config.rope_parameters[layer_type] else: + # Transformers v4 rope config. # Global attention. Use the values in config.json. - rope_theta = config.rope_theta - rope_scaling = config.rope_scaling + rope_parameters = config.rope_parameters.copy() + # Local attention. Override the values in config.json. 
+ if is_sliding: + rope_parameters["rope_theta"] = config.rope_local_base_freq first_kv_shared_layer_idx = ( config.num_hidden_layers - config.num_kv_shared_layers @@ -383,9 +386,8 @@ class Gemma3nAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=rope_parameters, is_neox_style=True, - rope_scaling=rope_scaling, ) self.attn = Attention( diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index faa0674a2e43d..f8ef3b0385fb1 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -57,10 +57,8 @@ class Glm4Attention(nn.Module): max_position: int = 4096 * 32, head_dim: int | None = None, qkv_bias: bool = False, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -86,7 +84,6 @@ class Glm4Attention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, self.head_dim, @@ -107,8 +104,7 @@ class Glm4Attention(nn.Module): self.head_dim, rotary_dim=self.rotary_dim, max_position=max_position, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, partial_rotary_factor=partial_rotary_factor, is_neox_style=False, ) @@ -150,8 +146,6 @@ class Glm4DecoderLayer(nn.Module): quant_config = vllm_config.quant_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) self.self_attn = Glm4Attention( config=config, @@ -159,12 +153,10 @@ class Glm4DecoderLayer(nn.Module): num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, 
num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, qkv_bias=getattr(config, "attention_bias", False), head_dim=getattr(config, "head_dim", None), cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, prefix=f"{prefix}.self_attn", attn_type=AttentionType.DECODER, ) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 7a4fee76ae6b3..6581bbda6d609 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -703,7 +703,6 @@ class Glm4vVisionTransformer(nn.Module): head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) self.blocks = nn.ModuleList( diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 1422dbe9b3cd0..5aa51af54a00b 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -26,7 +26,6 @@ import typing from collections.abc import Callable, Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -233,8 +232,6 @@ class Glm4MoeAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 131072, head_dim: int | None = None, rms_norm_eps: float = 1e-05, @@ -264,7 +261,6 @@ class Glm4MoeAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.use_qk_norm = use_qk_norm @@ -291,8 +287,7 @@ class Glm4MoeAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, partial_rotary_factor=partial_rotary_factor, ) self.attn = Attention( @@ -341,8 
+336,6 @@ class Glm4MoeDecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 131072) # DecoderLayers are created with `make_layers` which passes the prefix # with the layer's index. @@ -354,8 +347,6 @@ class Glm4MoeDecoderLayer(nn.Module): hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, head_dim=config.head_dim, rms_norm_eps=config.rms_norm_eps, diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index e416ecde0c1e0..e94de8952fa63 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -95,13 +95,12 @@ class GPTJAttention(nn.Module): scaling = self.head_size**-0.5 assert getattr(config, "rotary", True) assert config.rotary_dim % 2 == 0 - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_size, rotary_dim=config.rotary_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=config.rope_parameters, is_neox_style=False, ) self.attn = Attention( diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index af0c9209231cb..815c2fba4d9fe 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -92,13 +92,12 @@ class GPTNeoXAttention(nn.Module): scaling = self.head_size**-0.5 rotary_dim = int(self.head_size * config.rotary_pct) assert rotary_dim % 2 == 0 - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( 
self.head_size, rotary_dim=rotary_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 7df3b087ccb88..f310f71af92d9 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -67,16 +67,16 @@ class OAIAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=config.max_position_embeddings, - base=config.rope_theta, dtype=torch.float32, - rope_scaling={ + rope_parameters={ + "rope_theta": config.rope_parameters["rope_theta"], "rope_type": "yarn", - "factor": config.rope_scaling["factor"], - "original_max_position_embeddings": config.rope_scaling[ + "factor": config.rope_parameters["factor"], + "original_max_position_embeddings": config.rope_parameters[ "original_max_position_embeddings" ], - "beta_fast": config.rope_scaling["beta_fast"], - "beta_slow": config.rope_scaling["beta_slow"], + "beta_fast": config.rope_parameters["beta_fast"], + "beta_slow": config.rope_parameters["beta_slow"], }, is_neox_style=True, ) @@ -90,7 +90,6 @@ class OAIAttention(nn.Module): self.q_size = self.num_attention_heads * self.head_dim // tp_size self.kv_size = self.num_key_value_heads * self.head_dim // tp_size self.scaling = self.head_dim**-0.5 - self.rope_theta = config.rope_theta self.qkv_proj = QKVParallelLinear( hidden_size=self.hidden_size, diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index c44b4021471ef..1dc205b47753d 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -26,7 +26,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -112,8 +111,6 @@ class GraniteAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - 
rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -143,7 +140,6 @@ class GraniteAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = config.attention_multiplier - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -167,8 +163,7 @@ class GraniteAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -204,14 +199,6 @@ class GraniteDecoderLayer(nn.Module): super().__init__() self.hidden_size = config.hidden_size self.residual_multiplier = config.residual_multiplier - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -225,8 +212,6 @@ class GraniteDecoderLayer(nn.Module): num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 5c6759ded0669..8f4139d63c3f6 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -141,8 +141,7 @@ class GraniteMoeAttention(nn.Module): num_heads: int, num_kv_heads: int, 
max_position: int = 4096 * 32, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, attention_multiplier: float | None = None, @@ -172,7 +171,6 @@ class GraniteMoeAttention(nn.Module): if attention_multiplier is not None else self.head_dim**-1 ) - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -194,9 +192,8 @@ class GraniteMoeAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=int(self.rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, - rope_scaling=rope_scaling, ) self.attn = Attention( self.num_heads, @@ -235,16 +232,12 @@ class GraniteMoeDecoderLayer(nn.Module): parallel_config = vllm_config.parallel_config self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) self.self_attn = GraniteMoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index a340112ec62ae..9d5eeef198a61 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -273,10 +273,7 @@ class GraniteMoeHybridAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=config.max_position_embeddings, - base=int(config.rope_theta), - rope_scaling=config.rope_scaling - if hasattr(config, "rope_scaling") and config.rope_scaling is not None - else None, + 
rope_parameters=config.rope_parameters, is_neox_style=True, ) else: diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py index 926c539af33be..fd346db7e35aa 100644 --- a/vllm/model_executor/models/granitemoeshared.py +++ b/vllm/model_executor/models/granitemoeshared.py @@ -84,16 +84,12 @@ class GraniteMoeSharedDecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) self.self_attn = GraniteMoeAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 9dc231863f74f..4bf23cd6fd19a 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -25,6 +25,7 @@ from collections.abc import Iterable from itertools import islice +from typing import Any import torch import torch.nn.functional as F @@ -134,7 +135,7 @@ class Grok1Attention(nn.Module): num_heads: int, num_kv_heads: int, max_position: int = 4096 * 32, - rope_theta: float = 10000, + rope_parameters: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -161,7 +162,6 @@ class Grok1Attention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -183,7 +183,7 @@ class Grok1Attention(nn.Module): self.head_dim, rotary_dim=self.head_dim, 
max_position=max_position, - base=int(self.rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, ) @@ -234,15 +234,12 @@ class Grok1DecoderLayer(nn.Module): if not self.use_fp8 and hasattr(quant_config, "is_fp8"): self.use_fp8 = quant_config.is_fp8 - # Requires transformers > 4.32.0 - # Default rope_theta value if not in config - rope_theta = 10000 self.attn = Grok1Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index 1eadcbe67ade3..9fa5e2bd33f21 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -27,7 +27,6 @@ import typing from collections.abc import Callable, Iterable from itertools import islice -from typing import Any import regex as re import torch @@ -142,8 +141,6 @@ class HunYuanAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -177,7 +174,6 @@ class HunYuanAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.use_qk_norm = getattr(config, "use_qk_norm", False) self.layer_id = layer_id @@ -204,8 +200,7 @@ class HunYuanAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( 
@@ -254,8 +249,6 @@ class HunYuanCrossAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -289,7 +282,6 @@ class HunYuanCrossAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.use_qk_norm = getattr(config, "use_qk_norm", False) self.layer_id = layer_id @@ -314,8 +306,7 @@ class HunYuanCrossAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -494,14 +485,6 @@ class HunYuanDecoderLayer(nn.Module): if isinstance(config.intermediate_size, int) else config.intermediate_size[layer_id] ) - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) attention_bias = getattr(config, "attention_bias", False) or getattr( config, "bias", False @@ -520,8 +503,6 @@ class HunYuanDecoderLayer(nn.Module): num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, @@ -537,8 +518,6 @@ class HunYuanDecoderLayer(nn.Module): num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - 
rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 60fbeb842dd4b..dc8f821bd134f 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -91,8 +91,7 @@ class InternLM2Attention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -120,7 +119,6 @@ class InternLM2Attention(nn.Module): self.kv_size = self.num_kv_heads * self.head_dim self.key_value_groups = int(self.num_heads / self.num_kv_heads) self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.wqkv = QKVParallelLinear( @@ -144,8 +142,7 @@ class InternLM2Attention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -204,15 +201,12 @@ class InternLMDecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.attention = InternLM2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git 
a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 6dc081e34157b..a57db82242af9 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -30,15 +30,12 @@ class InternLM2VEDecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.attention = InternLM2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/kimi_linear.py b/vllm/model_executor/models/kimi_linear.py index f3675075a48f4..4562b2202c5ec 100644 --- a/vllm/model_executor/models/kimi_linear.py +++ b/vllm/model_executor/models/kimi_linear.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable -from typing import Any import torch from torch import nn @@ -190,9 +189,7 @@ class KimiMLAAttention(nn.Module): v_head_dim: int, q_lora_rank: int | None, kv_lora_rank: int, - rope_theta: float = 10000, use_nope: bool = False, - rope_scaling: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -210,11 +207,9 @@ class KimiMLAAttention(nn.Module): tp_size = get_tensor_model_parallel_world_size() self.num_local_heads = num_heads // tp_size self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta self.use_nope = use_nope assert self.use_nope is True assert self.q_lora_rank is None - assert rope_scaling is None assert num_heads % tp_size == 0 
self.kv_a_proj_with_mqa = ReplicatedLinear( self.hidden_size, diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py index aeb25602f11a4..74bdde27ece5c 100644 --- a/vllm/model_executor/models/lfm2.py +++ b/vllm/model_executor/models/lfm2.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from itertools import islice -from typing import Any import torch import torch.nn as nn @@ -96,8 +95,6 @@ class Lfm2Attention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -126,7 +123,6 @@ class Lfm2Attention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -149,8 +145,7 @@ class Lfm2Attention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -199,14 +194,6 @@ class Lfm2AttentionDecoderLayer(nn.Module): self.config = config self.layer_idx = layer_idx - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = Lfm2Attention( @@ -215,8 +202,6 @@ class Lfm2AttentionDecoderLayer(nn.Module): hidden_size=config.hidden_size, 
num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py index 6b7b5564ee989..c088a08211527 100644 --- a/vllm/model_executor/models/lfm2_moe.py +++ b/vllm/model_executor/models/lfm2_moe.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from itertools import islice -from typing import Any import torch import torch.nn as nn @@ -189,8 +188,6 @@ class Lfm2MoeAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -219,7 +216,6 @@ class Lfm2MoeAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -242,8 +238,7 @@ class Lfm2MoeAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -293,14 +288,6 @@ class Lfm2MoeAttentionDecoderLayer(nn.Module): self.config = config self.layer_idx = layer_idx - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) 
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = Lfm2MoeAttention( @@ -309,8 +296,6 @@ class Lfm2MoeAttentionDecoderLayer(nn.Module): hidden_size=config.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 0a3f37c30ab5f..d5b49d2fb4c26 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -26,7 +26,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -120,8 +119,6 @@ class LlamaAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -157,7 +154,6 @@ class LlamaAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings llama_4_scaling_config = getattr(config, "llama_4_scaling", None) @@ -186,9 +182,7 @@ class LlamaAttention(nn.Module): prefix=f"{prefix}.o_proj", ) - self._init_rotary_emb( - config, rope_scaling=rope_scaling, quant_config=quant_config - ) + self._init_rotary_emb(config, quant_config=quant_config) sliding_window = None if layer_types := getattr(config, "layer_types", None): @@ -258,7 +252,6 @@ class LlamaAttention(nn.Module): def _init_rotary_emb( self, config: LlamaConfig, - rope_scaling: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True @@ -270,8 +263,7 @@ class LlamaAttention(nn.Module): 
self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, ) @@ -291,14 +283,6 @@ class LlamaDecoderLayer(nn.Module): quant_config = self.get_quant_config(vllm_config) self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -326,8 +310,6 @@ class LlamaDecoderLayer(nn.Module): num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index a7e0732ec71e2..4c6d1d4244755 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -19,7 +19,6 @@ """Inference-only LLaMA model compatible with HuggingFace weights.""" from collections.abc import Iterable -from typing import Any import torch from torch import nn @@ -171,8 +170,6 @@ class Llama4Attention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -208,7 +205,6 @@ class Llama4Attention(nn.Module): self.floor_scale = getattr(config, "floor_scale", 8192.0) self.attn_scale = 
getattr(config, "attn_scale", 0.1) - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.n_rep = self.num_heads // self.num_kv_heads self.qk_norm = ( @@ -248,8 +244,7 @@ class Llama4Attention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=int(rope_theta), - rope_scaling=rope_scaling if rope_scaling != "default" else None, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, ) if not self.nope @@ -331,8 +326,6 @@ class Llama4DecoderLayer(nn.Module): self.layer_idx = extract_layer_index(prefix) self.global_layer = config.no_rope_layers[self.layer_idx] == 0 self.hidden_size = config.hidden_size - rope_theta = config.rope_theta - rope_scaling = config.rope_scaling max_position_embeddings = config.max_position_embeddings self.self_attn = Llama4Attention( @@ -340,8 +333,6 @@ class Llama4DecoderLayer(nn.Module): hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=False, diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py index 5de10e7086830..fafe97cd2be7e 100644 --- a/vllm/model_executor/models/longcat_flash.py +++ b/vllm/model_executor/models/longcat_flash.py @@ -108,8 +108,7 @@ class FlashConfig(PretrainedConfig): eos_token_id=100001, pretraining_tp=1, tie_word_embeddings=False, - rope_theta=1000000.0, - rope_scaling=None, + rope_parameters=None, attention_bias=False, attention_dropout=0.0, mla_scale_q_lora=False, @@ -162,8 +161,13 @@ class FlashConfig(PretrainedConfig): self.rms_norm_eps = rms_norm_eps self.pretraining_tp = pretraining_tp self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = 
kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 1000000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mla_scale_q_lora = mla_scale_q_lora @@ -336,15 +340,7 @@ class FlashDecoderLayer(nn.Module): super().__init__() self.layer_idx = int(prefix.split(sep=".")[-1]) self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) # Dual attention structure self.self_attn = nn.ModuleList( @@ -361,8 +357,6 @@ class FlashDecoderLayer(nn.Module): config.q_lora_rank if hasattr(config, "q_lora_rank") else None ), kv_lora_rank=config.kv_lora_rank, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=None diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 914b097fe199e..04923833065f3 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -230,8 +230,7 @@ class MiniCPMAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -257,7 +256,6 @@ class MiniCPMAttention(nn.Module): self.q_size = self.num_heads * self.head_dim 
self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -281,8 +279,7 @@ class MiniCPMAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( @@ -324,8 +321,6 @@ class MiniCPMDecoderLayer(nn.Module): self.cache_config = cache_config self.quant_config = quant_config self.hidden_size = config.hidden_size - self.rope_theta = getattr(config, "rope_theta", 10000) - self.rope_scaling = getattr(config, "rope_scaling", None) self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.prefix = prefix self._init_attn_block() @@ -339,8 +334,7 @@ class MiniCPMDecoderLayer(nn.Module): hidden_size=self.hidden_size, num_heads=self.config.num_attention_heads, num_kv_heads=self.config.num_key_value_heads, - rope_theta=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=self.config.rope_parameters, max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index d3b6966ee3a7f..2d775219fc972 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -25,8 +25,6 @@ # limitations under the License. 
"""Inference-only MiniCPM3 model compatible with HuggingFace weights.""" -from typing import Any - import torch from torch import nn from transformers import PretrainedConfig @@ -62,8 +60,6 @@ class MiniCPM3Attention(nn.Module): v_head_dim: int, q_lora_rank: int, kv_lora_rank: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -84,7 +80,6 @@ class MiniCPM3Attention(nn.Module): self.num_local_heads = num_heads // tp_size self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.q_a_proj = ReplicatedLinear( @@ -127,8 +122,7 @@ class MiniCPM3Attention(nn.Module): self.qk_rope_head_dim, rotary_dim=self.qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_local_heads, @@ -204,8 +198,6 @@ class MiniCPM3DecoderLayer(MiniCPMDecoderLayer): v_head_dim=self.config.v_head_dim, q_lora_rank=self.config.q_lora_rank, kv_lora_rank=self.config.kv_lora_rank, - rope_theta=self.rope_theta, - rope_scaling=self.rope_scaling, max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py index d0cdb70aa8574..e6bccfcac4f1a 100644 --- a/vllm/model_executor/models/minicpm_eagle.py +++ b/vllm/model_executor/models/minicpm_eagle.py @@ -69,8 +69,6 @@ class EagleMiniCPMDecoderLayer(nn.Module): self.cache_config = cache_config self.quant_config = quant_config self.hidden_size = config.hidden_size - self.rope_theta = getattr(config, "rope_theta", 10000) - self.rope_scaling = getattr(config, "rope_scaling", None) self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) 
self.prefix = prefix self._init_attn_block() @@ -84,8 +82,7 @@ class EagleMiniCPMDecoderLayer(nn.Module): hidden_size=self.hidden_size, num_heads=self.config.num_attention_heads, num_kv_heads=self.config.num_key_value_heads, - rope_theta=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=self.config.rope_parameters, max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py index 49d2f2d261969..4955c68c0cda8 100644 --- a/vllm/model_executor/models/minimax_m2.py +++ b/vllm/model_executor/models/minimax_m2.py @@ -149,8 +149,7 @@ class MiniMaxM2Attention(nn.Module): num_heads: int, num_kv_heads: int, rotary_dim: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, attn_window_size: int | None = None, max_position_embeddings: int = 8192, head_dim: int | None = None, @@ -180,7 +179,6 @@ class MiniMaxM2Attention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -205,8 +203,7 @@ class MiniMaxM2Attention(nn.Module): self.head_dim, rotary_dim=rotary_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -252,8 +249,6 @@ class MiniMaxM2DecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int): max_position_embeddings 
= max( @@ -269,8 +264,7 @@ class MiniMaxM2DecoderLayer(nn.Module): num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, rotary_dim=config.rotary_dim, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index bf1ecc822756d..50f7396e2de60 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -188,7 +188,7 @@ class MiniMaxText01Attention(nn.Module): num_kv_heads: int, rotary_dim: int, max_position: int = 4096 * 32, - rope_theta: float = 10000, + rope_parameters: dict | None = None, sliding_window: int | None = None, quant_config: QuantizationConfig | None = None, layer_idx: int = None, @@ -214,7 +214,6 @@ class MiniMaxText01Attention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.sliding_window = sliding_window self.prefix = prefix @@ -247,7 +246,7 @@ class MiniMaxText01Attention(nn.Module): head_size=self.head_dim, rotary_dim=rotary_dim, max_position=max_position, - base=int(rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, dtype=torch.float32, ) @@ -287,8 +286,6 @@ class MiniMaxText01DecoderLayer(nn.Module): self.hidden_size = config.hidden_size self.expert_num = expert_num - rope_theta = getattr(config, "rope_theta", 10000) - head_dim = getattr(config, "head_dim", None) if head_dim is None: head_dim = config.hidden_size // config.num_attention_heads @@ -328,7 +325,7 @@ class MiniMaxText01DecoderLayer(nn.Module): else head_dim, num_kv_heads=config.num_key_value_heads, max_position=max_position_embeddings, - rope_theta=rope_theta, + 
rope_parameters=config.rope_parameters, sliding_window=config.sliding_window, quant_config=quant_config, layer_idx=self._ilayer, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index d7a1cb82fb4fb..54ab8dd493e73 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -161,7 +161,6 @@ class MixtralAttention(nn.Module): num_heads: int, num_kv_heads: int, max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -189,7 +188,6 @@ class MixtralAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -211,7 +209,7 @@ class MixtralAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=int(self.rope_theta), + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( @@ -248,15 +246,12 @@ class MixtralDecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) self.self_attn = MixtralAttention( config=config, hidden_size=self.hidden_size, num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn", diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index e25a104d822a7..286859d188d34 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -292,13 +292,17 @@ class Llama4VisionAttention(nn.Module): prefix=f"{prefix}.o_proj", ) + rope_parameters = { + "rope_type": "mllama4", + 
"rope_theta": config.rope_parameters["rope_theta"], + } + self.rotary_emb = get_rope( head_size=self.head_dim, rotary_dim=config.hidden_size // config.num_attention_heads // 2, # number of image patches max_position=(config.image_size // config.patch_size) ** 2, - base=config.rope_theta, - rope_scaling={"rope_type": "mllama4"}, + rope_parameters=rope_parameters, is_neox_style=False, dtype=torch.complex64, # important ) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index ab83a271e30a0..dc06938d5d6e1 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -410,7 +410,6 @@ class MolmoAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta # Attention input projection. Projects x -> (q, k, v) self.qkv_proj = QKVParallelLinear( @@ -437,7 +436,7 @@ class MolmoAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, + rope_parameters=config.rope_parameters, ) self.scaling = self.head_dim**-0.5 self.attn = Attention( diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 92dcf5ea57008..c3337bd1ea699 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -26,7 +26,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -150,8 +149,6 @@ class NemotronAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -181,7 +178,6 @@ class NemotronAttention(nn.Module): self.q_size = self.num_heads * self.head_dim 
self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.partial_rotary_factor = config.partial_rotary_factor self.max_position_embeddings = max_position_embeddings @@ -206,8 +202,7 @@ class NemotronAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, partial_rotary_factor=self.partial_rotary_factor, ) self.attn = Attention( @@ -243,14 +238,6 @@ class NemotronDecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -264,8 +251,6 @@ class NemotronDecoderLayer(nn.Module): num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index b839206a3094d..2eebe38051cbd 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -26,7 +26,6 @@ from collections.abc import Iterable from itertools import islice -from typing import Any import torch from torch import nn @@ -82,8 +81,6 @@ class DeciLMAttention(LlamaAttention): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, 
max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -97,8 +94,6 @@ class DeciLMAttention(LlamaAttention): hidden_size, num_heads, num_kv_heads, - rope_theta, - rope_scaling, max_position_embeddings, quant_config, bias, @@ -111,7 +106,6 @@ class DeciLMAttention(LlamaAttention): def _init_rotary_emb( self, config, - rope_scaling: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: # Enables YARN for Mistral and LLaMA4 derivatives. @@ -126,8 +120,7 @@ class DeciLMAttention(LlamaAttention): self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, partial_rotary_factor=self.partial_rotary_factor, ) @@ -148,14 +141,6 @@ class DeciLMDecoderLayer(nn.Module): self._is_no_op_ffn = block_config.ffn.no_op self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -176,8 +161,6 @@ class DeciLMDecoderLayer(nn.Module): hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=num_kv_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 487e3f671a455..bd8a8e317544f 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -87,7 +87,6 @@ class 
OlmoAttention(nn.Module): self.num_heads = self.total_num_heads // tensor_model_parallel_world_size self.head_dim = self.hidden_size // self.total_num_heads self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta self.clip_qkv = config.clip_qkv # Attention input projection. Projects x -> (q, k, v) @@ -105,7 +104,7 @@ class OlmoAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, + rope_parameters=config.rope_parameters, ) self.scaling = self.head_dim**-0.5 self.attn = Attention( diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 045582c889ee4..f0f6b2f6b3e6d 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -99,7 +99,6 @@ class Olmo2Attention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.max_position_embeddings = self.config.max_position_embeddings - self.rope_theta = self.config.rope_theta # Attention input projection. Projects x -> (q, k, v) self.qkv_proj = QKVParallelLinear( @@ -139,15 +138,17 @@ class Olmo2Attention(nn.Module): prefix=f"{prefix}.attn", ) - # Rotary embeddings. Rope scaling is only applied on full attention - # layers. - self.rope_scaling = self.config.rope_scaling if sliding_window is None else None + # Rotary embeddings. Rope scaling is only applied on full attention layers. + if sliding_window is None: + rope_parameters = self.config.rope_parameters + else: + rope_theta = self.config.rope_parameters["rope_theta"] + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, # type: ignore - rope_scaling=self.rope_scaling, + rope_parameters=rope_parameters, ) # Attention output projection. 
diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 499eb05de76e4..c39e338d72e22 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -123,8 +123,6 @@ class OlmoeAttention(nn.Module): quant_config = vllm_config.quant_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 4096) num_heads = config.num_attention_heads @@ -148,7 +146,6 @@ class OlmoeAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -176,8 +173,7 @@ class OlmoeAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index d13a745beffeb..f814cdfec5a22 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -77,6 +77,7 @@ from vllm.model_executor.models.utils import ( sequence_parallel_chunk, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta def check_ffn_act_fn(act_fn: str): @@ -259,7 +260,6 @@ class OpenPanguMLAAttention(nn.Module): v_head_dim: int, q_lora_rank: int | None, kv_lora_rank: int, - rope_theta: float = 10000, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -274,8 +274,6 @@ class OpenPanguMLAAttention(nn.Module): self.v_head_dim = v_head_dim self.q_lora_rank = q_lora_rank 
self.kv_lora_rank = kv_lora_rank - self.rope_theta = rope_theta - self.tp_size = get_tensor_model_parallel_world_size() if num_heads % self.tp_size != 0: raise ValueError( @@ -339,7 +337,9 @@ class OpenPanguMLAAttention(nn.Module): ) # TODO: remove hard coding - rope_scaling = { + set_default_rope_theta(config, default_theta=10000) + rope_parameters = { + "rope_theta": config.rope_parameters["rope_theta"], "beta_fast": 32, "beta_slow": 1, "factor": 1, @@ -353,8 +353,7 @@ class OpenPanguMLAAttention(nn.Module): qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, is_neox_style=False, ) @@ -407,8 +406,6 @@ class OpenPanguEmbeddedAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -454,7 +451,6 @@ class OpenPanguEmbeddedAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -475,9 +471,7 @@ class OpenPanguEmbeddedAttention(nn.Module): prefix=f"{prefix}.o_proj", ) - self._init_rotary_emb( - config, rope_scaling=rope_scaling, quant_config=quant_config - ) + self._init_rotary_emb(config, quant_config=quant_config) if hasattr(config, "interleaved_sliding_window"): interleaved_sliding_window = config.interleaved_sliding_window @@ -521,7 +515,6 @@ class OpenPanguEmbeddedAttention(nn.Module): def _init_rotary_emb( self, config: PretrainedConfig, - rope_scaling: dict[str, Any] | None, quant_config: QuantizationConfig | None, ) -> None: is_neox_style = True @@ -533,8 +526,7 @@ class OpenPanguEmbeddedAttention(nn.Module): self.head_dim, 
rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, ) @@ -555,7 +547,6 @@ class OpenPanguDecoderLayer(nn.Module): parallel_config = vllm_config.parallel_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) layer_idx = int(prefix.split(sep=".")[-1]) @@ -579,7 +570,6 @@ class OpenPanguDecoderLayer(nn.Module): config.q_lora_rank if hasattr(config, "q_lora_rank") else None ), kv_lora_rank=config.kv_lora_rank, - rope_theta=rope_theta, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, @@ -607,8 +597,6 @@ class OpenPanguDecoderLayer(nn.Module): num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=getattr(config, "rope_scaling", None), max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 859cd2cecf897..b30be93ca726f 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -88,8 +88,7 @@ class OrionAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -115,7 +114,6 @@ class OrionAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -139,8 
+137,7 @@ class OrionAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -175,15 +172,12 @@ class OrionDecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.self_attn = OrionAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py index 9db6c317c26a8..63d2fff6ec8bc 100644 --- a/vllm/model_executor/models/ouro.py +++ b/vllm/model_executor/models/ouro.py @@ -112,10 +112,8 @@ class OuroAttention(nn.Module): num_heads: int, num_kv_heads: int, max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: dict[str, Any] | None = None, @@ -140,7 +138,6 @@ class OuroAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.dual_chunk_attention_config = dual_chunk_attention_config # Get total_ut_steps from config, default to 4 if not specified @@ -170,8 +167,7 @@ class OuroAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - 
rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = nn.ModuleList() @@ -226,9 +222,6 @@ class OuroDecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -244,10 +237,8 @@ class OuroDecoderLayer(nn.Module): num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, prefix=f"{prefix}.self_attn", attn_type=attn_type, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 3bf6a1d9763d0..98963d52e4848 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -106,7 +106,6 @@ class PersimmonAttention(nn.Module): self.num_heads = self.total_num_heads // tensor_parallel_world_size self.head_dim = self.hidden_size // self.total_num_heads self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta self.partial_rotary_factor = config.partial_rotary_factor self.is_causal = True @@ -138,7 +137,7 @@ class PersimmonAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=self.rope_theta, + rope_parameters=config.rope_parameters, partial_rotary_factor=self.partial_rotary_factor, ) self.scaling = self.head_dim**-0.5 diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 8fee53c23fb4b..da476f621627b 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ 
-115,16 +115,12 @@ class PhiAttention(nn.Module): ) assert rotary_dim % 2 == 0 - # pylint: disable=C0301 - # Refer to: - # https://huggingface.co/microsoft/phi-1_5/blob/d212a789620c380ff32ca1d1ee9943a777360987/modeling_phi.py#L518 - rope_theta = getattr(config, "rope_theta", 10000.0) max_position_embeddings = getattr(config, "max_position_embeddings", 2048) self.rotary_emb = get_rope( self.head_size, rotary_dim=rotary_dim, max_position=max_position_embeddings, - base=rope_theta, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 92fd858b608bc..8ffac95d93960 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -86,7 +86,7 @@ class PhiMoEConfig(PretrainedConfig): bos_token_id=1, eos_token_id=2, tie_word_embeddings=False, - rope_theta=1e6, + rope_parameters=None, sliding_window=None, attention_dropout=0.0, num_experts_per_tok=2, @@ -119,7 +119,9 @@ class PhiMoEConfig(PretrainedConfig): self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta + if rope_parameters is None: + rope_theta = kwargs.pop("rope_theta", 1e6) + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} self.attention_dropout = attention_dropout self.num_experts_per_tok = num_experts_per_tok @@ -302,12 +304,11 @@ class PhiMoEAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict, head_dim: int | None = None, max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: dict | None = None, prefix: str = "", ) -> None: super().__init__() @@ -332,8 +333,6 @@ class PhiMoEAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = 
self.head_dim**-0.5 - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.qkv_proj = QKVParallelLinear( hidden_size, @@ -355,9 +354,8 @@ class PhiMoEAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=int(self.rope_theta), + rope_parameters=rope_parameters, is_neox_style=True, - rope_scaling=self.rope_scaling, ) self.attn = Attention( self.num_heads, @@ -393,7 +391,6 @@ class PhiMoEDecoderLayer(nn.Module): super().__init__() self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 10000) self.self_attn = PhiMoEAttention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, @@ -402,10 +399,9 @@ class PhiMoEDecoderLayer(nn.Module): head_dim=getattr( config, "head_dim", self.hidden_size // config.num_attention_heads ), - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=config.rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", ) self.block_sparse_moe = PhiMoE( diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 52c9755e0e0ea..22f9c87fc905b 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -567,10 +567,6 @@ class Plamo2AttentionMixer(nn.Module): prefix=f"{prefix}.o_proj", ) - self.rope_theta = config.rope_theta if hasattr(config, "rope_theta") else 10000 - self.rope_scaling = ( - config.rope_scaling if hasattr(config, "rope_scaling") else None - ) max_position = config.max_position_embeddings if hasattr(vllm_config.model_config, "max_model_len") and isinstance( vllm_config.model_config.max_model_len, int @@ -581,8 +577,7 @@ class Plamo2AttentionMixer(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=config.rope_parameters, ) self.q_norm = 
RMSNorm(config.hidden_size_per_head, eps=config.rms_norm_eps) self.q_norm.weight = torch.nn.Parameter( diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 50a125c3f5973..c973e79170982 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -83,8 +83,7 @@ class QWenAttention(nn.Module): hidden_size: int, num_heads: int, max_position_embeddings: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", @@ -117,8 +116,7 @@ class QWenAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -153,14 +151,11 @@ class QWenBlock(nn.Module): super().__init__() self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) self.attn = QWenAttention( config.hidden_size, config.num_attention_heads, config.max_position_embeddings, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, cache_config=cache_config, quant_config=quant_config, prefix=f"{prefix}.attn", diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 1bbb969ce5aa3..32b6d6dd07b83 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -57,7 +57,7 @@ from vllm.model_executor.model_loader.weight_utils import ( maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.config import is_interleaved +from vllm.transformers_utils.config import is_interleaved, set_default_rope_theta from .interfaces import SupportsEagle3, 
SupportsLoRA, SupportsPP from .utils import ( @@ -114,11 +114,10 @@ class Qwen2Attention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict[str, Any], max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: dict[str, Any] | None = None, @@ -143,7 +142,6 @@ class Qwen2Attention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.dual_chunk_attention_config = dual_chunk_attention_config self.qkv_proj = QKVParallelLinear( @@ -167,8 +165,7 @@ class Qwen2Attention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) attn_cls = ( @@ -216,9 +213,7 @@ class Qwen2DecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=1000000) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -237,10 +232,9 @@ class Qwen2DecoderLayer(nn.Module): num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/qwen2_5_vl.py 
b/vllm/model_executor/models/qwen2_5_vl.py index 5b5d50ec8935a..8e3c0e84dfe51 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -641,7 +641,6 @@ class Qwen2_5_VisionTransformer(nn.Module): head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 2ff0d19df238c..6b97d0b2ca2e3 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -194,8 +194,7 @@ class Qwen2MoeAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 8192, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, @@ -222,7 +221,6 @@ class Qwen2MoeAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.dual_chunk_attention_config = dual_chunk_attention_config @@ -248,8 +246,7 @@ class Qwen2MoeAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = Attention( @@ -291,8 +288,6 @@ class Qwen2MoeDecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -301,8 +296,7 @@ class Qwen2MoeDecoderLayer(nn.Module): hidden_size=self.hidden_size, 
num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index cda8eaf5377f1..d25ff2785bfef 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -643,7 +643,6 @@ class Qwen2VisionTransformer(nn.Module): head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 8d7f22a33fe6c..93a629d81e8ff 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -42,6 +42,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import SupportsEagle3, SupportsLoRA, SupportsPP from .qwen2 import Qwen2MLP as Qwen3MLP @@ -57,14 +58,13 @@ class Qwen3Attention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, + rope_parameters: dict, max_position: int = 4096 * 32, head_dim: int | None = None, rms_norm_eps: float = 1e-06, qkv_bias: bool = False, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: dict[str, Any] | None = None, @@ -89,7 +89,6 @@ class Qwen3Attention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim 
self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.dual_chunk_attention_config = dual_chunk_attention_config self.qkv_proj = QKVParallelLinear( @@ -113,8 +112,7 @@ class Qwen3Attention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = Attention( @@ -166,9 +164,7 @@ class Qwen3DecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=1000000) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None ) @@ -187,13 +183,12 @@ class Qwen3DecoderLayer(nn.Module): num_heads=config.num_attention_heads, max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), head_dim=getattr(config, "head_dim", None), cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 96751fee800bb..8ee3dd99e11db 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -216,8 +216,7 @@ class Qwen3MoeAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any], max_position_embeddings: int = 8192, head_dim: int | None = None, rms_norm_eps: float = 1e-06, @@ -247,7 +246,6 @@ 
class Qwen3MoeAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.dual_chunk_attention_config = dual_chunk_attention_config @@ -273,8 +271,7 @@ class Qwen3MoeAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, ) self.attn = Attention( @@ -326,8 +323,6 @@ class Qwen3MoeDecoderLayer(nn.Module): quant_config = vllm_config.quant_config self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) dual_chunk_attention_config = getattr( config, "dual_chunk_attention_config", None @@ -336,8 +331,7 @@ class Qwen3MoeDecoderLayer(nn.Module): hidden_size=self.hidden_size, num_heads=config.num_attention_heads, num_kv_heads=config.num_key_value_heads, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, max_position_embeddings=max_position_embeddings, rms_norm_eps=config.rms_norm_eps, qkv_bias=getattr(config, "attention_bias", False), diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index ad631f61e4b93..bfed64728305e 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -748,8 +748,7 @@ class Qwen3NextAttention(nn.Module): head_size=self.head_dim, rotary_dim=self.head_dim, max_position=config.max_position_embeddings, - base=config.rope_theta, - rope_scaling=config.rope_scaling, + rope_parameters=config.rope_parameters, partial_rotary_factor=config.partial_rotary_factor, 
dual_chunk_attention_config=self.dual_chunk_attention_config, ) diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index d2fd74a5e41ad..54ef56f83344e 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -338,7 +338,6 @@ class Qwen3Omni_VisionTransformer(nn.Module): head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 0c546309400b7..c10aeaec5ab83 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -345,7 +345,6 @@ class Qwen3_VisionTransformer(nn.Module): head_size=head_dim, rotary_dim=head_dim // 2, max_position=8192, - base=10000.0, is_neox_style=True, ) diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index bf211d28f1844..4744d8e44f390 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -54,6 +54,7 @@ from vllm.model_executor.model_loader.weight_utils import ( maybe_remap_kv_scale_name, ) from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import set_default_rope_theta from .interfaces import SupportsLoRA, SupportsPP from .utils import ( @@ -112,11 +113,10 @@ class SeedOssAttention(nn.Module): num_heads: int, num_kv_heads: int, head_dim: int, + rope_parameters: dict, max_position: int = 4096 * 32, - rope_theta: float = 10000, cache_config: CacheConfig | None = None, quant_config: QuantizationConfig | None = None, - rope_scaling: tuple | None = None, prefix: str = "", attn_type: str = AttentionType.DECODER, ) -> None: @@ -140,7 +140,6 @@ class SeedOssAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - 
self.rope_theta = rope_theta self.qkv_proj = QKVParallelLinear( hidden_size, @@ -163,8 +162,7 @@ class SeedOssAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) self.attn = Attention( self.num_heads, @@ -200,9 +198,7 @@ class SeedOssDecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - # Requires transformers > 4.32.0 - rope_theta = getattr(config, "rope_theta", 1000000) - rope_scaling = getattr(config, "rope_scaling", None) + set_default_rope_theta(config, default_theta=1000000) # By default, SeedOss uses causal attention as it is a # decoder-only model. @@ -219,10 +215,9 @@ class SeedOssDecoderLayer(nn.Module): max_position=config.max_position_embeddings, num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, - rope_theta=rope_theta, cache_config=cache_config, quant_config=quant_config, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", attn_type=attn_type, ) diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 4ec855f794446..7e9fc51036d2e 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -25,7 +25,6 @@ """Inference-only Solar model compatible with HuggingFace weights.""" from collections.abc import Iterable -from typing import Any import torch from torch import nn @@ -111,8 +110,6 @@ class SolarAttention(nn.Module): hidden_size: int, num_heads: int, num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: dict[str, Any] | None = None, max_position_embeddings: int = 8192, quant_config: QuantizationConfig | None = None, bias: bool = False, @@ -142,7 +139,6 @@ class SolarAttention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = 
rope_theta self.max_position_embeddings = max_position_embeddings self.qkv_proj = QKVParallelLinear( @@ -166,8 +162,7 @@ class SolarAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, ) self.attn = Attention( self.num_heads, @@ -202,15 +197,6 @@ class SolarDecoderLayer(nn.Module): ) -> None: super().__init__() self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - - if rope_scaling is not None and getattr( - config, "original_max_position_embeddings", None - ): - rope_scaling["original_max_position_embeddings"] = ( - config.original_max_position_embeddings - ) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) # Support abacusai/Smaug-72B-v0.1 with attention_bias # Support internlm/internlm-7b with bias @@ -224,8 +210,6 @@ class SolarDecoderLayer(nn.Module): num_kv_heads=getattr( config, "num_key_value_heads", config.num_attention_heads ), - rope_theta=rope_theta, - rope_scaling=rope_scaling, max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 06eb7201c1a89..a738fcbb4ee28 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -153,7 +153,7 @@ class StablelmAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=self.config.max_position_embeddings, - base=self.config.rope_theta, + rope_parameters=self.config.rope_parameters, partial_rotary_factor=self.partial_rotary_factor, ) self.attn = Attention( diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 0f2942acd5006..1118fca3cac91 100644 --- a/vllm/model_executor/models/starcoder2.py +++ 
b/vllm/model_executor/models/starcoder2.py @@ -91,7 +91,6 @@ class Starcoder2Attention(nn.Module): self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 - self.rope_theta = config.rope_theta self.max_position_embeddings = config.max_position_embeddings self.use_bias = config.use_bias @@ -115,7 +114,7 @@ class Starcoder2Attention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=self.max_position_embeddings, - base=int(self.rope_theta), + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.attn = Attention( diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 4fff356b29e28..3c377a2c539df 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -36,6 +36,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.step3_vl import Step3TextConfig from .interfaces import SupportsPP from .utils import ( @@ -144,9 +145,8 @@ class Step3TextAttention(nn.Module): num_heads: int, num_kv_heads: int, norm_eps: float, - rope_theta: int, + rope_parameters: dict[str, Any], share_q_dim: int | None = None, - rope_scaling: dict[str, Any] | None = None, max_position_embedding: int = 8192, head_dim: int = 256, cache_config: CacheConfig | None = None, @@ -198,8 +198,7 @@ class Step3TextAttention(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embedding, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, ) scaling = self.head_dim**-0.5 self.attn = Attention( @@ -227,15 +226,13 @@ class Step3TextAttention(nn.Module): class Step3TextDecoderLayer(nn.Module): def __init__( self, - config: ModelConfig, + config: Step3TextConfig, cache_config: CacheConfig | 
None = None, quant_config: QuantizationConfig | None = None, prefix: str = "", ) -> None: super().__init__() - config = config.hf_config self.hidden_size = config.hidden_size - rope_scaling = getattr(config, "rope_scaling", None) self.self_attn = Step3TextAttention( hidden_size=self.hidden_size, @@ -247,8 +244,7 @@ class Step3TextDecoderLayer(nn.Module): max_position_embedding=config.max_position_embedding, head_dim=config.head_dim, share_q_dim=config.share_q_dim, - rope_theta=config.rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, prefix=f"{prefix}.self_attn", ) @@ -338,7 +334,7 @@ class Step3TextModel(nn.Module): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Step3TextDecoderLayer( - config=vllm_config.model_config, + config=config, cache_config=cache_config, quant_config=quant_config, prefix=prefix, diff --git a/vllm/model_executor/models/transformers/utils.py b/vllm/model_executor/models/transformers/utils.py index 517eb54d53ac6..b807f45b5d52b 100644 --- a/vllm/model_executor/models/transformers/utils.py +++ b/vllm/model_executor/models/transformers/utils.py @@ -22,6 +22,7 @@ from typing import TYPE_CHECKING, Literal import torch from torch import nn +from transformers.configuration_utils import ALLOWED_LAYER_TYPES from vllm.config.utils import getattr_iter from vllm.logger import init_logger @@ -203,5 +204,10 @@ def can_enable_torch_compile(vllm_config: "VllmConfig") -> bool: """ text_config = vllm_config.model_config.hf_config.get_text_config() # Dynamic rope scaling is not compatible with torch.compile - rope_scaling: dict = getattr(text_config, "rope_scaling", None) or {} - return rope_scaling.get("rope_type") != "dynamic" + rope_parameters: dict | None = getattr(text_config, "rope_parameters", None) or {} + if rope_parameters: + # Nest rope_parameters if not nested already to simplify logic + if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + 
rope_parameters = {"": rope_parameters} + return all(rp["rope_type"] != "dynamic" for rp in rope_parameters.values()) + return True diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index 729a9655d0879..653b5b9beef7b 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -128,7 +128,6 @@ class Zamba2Attention(nn.Module): tp_size = get_tensor_model_parallel_world_size() self.config = config self.num_hybrid_layers = num_hybrid_layers - self.rope_theta = config.rope_theta self.attention_hidden_size = config.attention_hidden_size self.total_num_attention_heads = config.num_attention_heads @@ -233,8 +232,7 @@ class Zamba2Attention(nn.Module): head_size=self.attention_head_dim, rotary_dim=self.attention_head_dim, max_position=config.max_position_embeddings, - base=self.rope_theta, - rope_scaling=None, + rope_parameters=config.rope_parameters, is_neox_style=True, ) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index ac4a71648cec8..4ca155af03dca 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -7,8 +7,9 @@ import time from collections.abc import Callable from dataclasses import asdict from functools import cache, partial +from importlib.metadata import version from pathlib import Path -from typing import Any, Literal, TypeVar +from typing import Any, Literal, TypeAlias, TypeVar import huggingface_hub from huggingface_hub import ( @@ -24,7 +25,9 @@ from huggingface_hub.utils import ( RepositoryNotFoundError, RevisionNotFoundError, ) +from packaging.version import Version from transformers import DeepseekV3Config, GenerationConfig, PretrainedConfig +from transformers.configuration_utils import ALLOWED_LAYER_TYPES from transformers.models.auto.image_processing_auto import get_image_processor_config from transformers.models.auto.modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, @@ -390,21 +393,61 @@ def 
file_or_path_exists( ) -def patch_rope_scaling(config: PretrainedConfig) -> None: +def set_default_rope_theta(config: PretrainedConfig, default_theta: float) -> None: + """Some models may have no rope_theta in their config but still use RoPE. + This function sets a default rope_theta if it's missing.""" + if getattr(config, "rope_parameters", None) is None: + config.rope_parameters = {"rope_type": "default"} + if "rope_theta" not in config.rope_parameters: + config.rope_parameters["rope_theta"] = default_theta + + +def patch_rope_parameters(config: PretrainedConfig) -> None: """Provide backwards compatibility for RoPE.""" - text_config = getattr(config, "text_config", None) - if text_config is not None: - patch_rope_scaling(text_config) + # Retrieve rope_parameters differently based on Transformers version + if Version(version("transformers")) >= Version("5.0.0.dev0"): + from transformers.modeling_rope_utils import RopeParameters - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is not None: - patch_rope_scaling_dict(rope_scaling) + rope_parameters: RopeParameters | dict[str, RopeParameters] | None = getattr( + config, "rope_parameters", None + ) + elif hasattr(config, "rope_parameters"): + # We are in Transformers v4 and rope_parameters + # has already been patched for this config + return + else: + # Convert Transformers v4 rope_theta and rope_scaling into rope_parameters + rope_theta: float | None = getattr(config, "rope_theta", None) + rope_scaling: dict | None = getattr(config, "rope_scaling", None) + rope_parameters = rope_scaling + # Move rope_theta into rope_parameters + if rope_theta is not None: + rope_parameters = rope_parameters or {"rope_type": "default"} + rope_parameters["rope_theta"] = rope_theta + # Add original_max_position_embeddings if present + if rope_parameters and ( + ompe := getattr(config, "original_max_position_embeddings", None) + ): + rope_parameters["original_max_position_embeddings"] = ompe + # Write back to 
config + config.rope_parameters = rope_parameters + + # No RoPE parameters to patch + if rope_parameters is None: + return + + # Handle nested rope_parameters in interleaved sliding attention models + if set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + for rope_parameters_layer_type in rope_parameters.values(): + patch_rope_parameters_dict(rope_parameters_layer_type) + else: + patch_rope_parameters_dict(rope_parameters) -def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None: - if "rope_type" in rope_scaling and "type" in rope_scaling: - rope_type = rope_scaling["rope_type"] - rope_type_legacy = rope_scaling["type"] +def patch_rope_parameters_dict(rope_parameters: dict[str, Any]) -> None: + if "rope_type" in rope_parameters and "type" in rope_parameters: + rope_type = rope_parameters["rope_type"] + rope_type_legacy = rope_parameters["type"] if rope_type != rope_type_legacy: raise ValueError( f"Found conflicts between 'rope_type={rope_type}' (modern " @@ -412,28 +455,28 @@ def patch_rope_scaling_dict(rope_scaling: dict[str, Any]) -> None: "You should only specify one of them." 
) - if "rope_type" not in rope_scaling and "type" in rope_scaling: - rope_scaling["rope_type"] = rope_scaling["type"] + if "rope_type" not in rope_parameters and "type" in rope_parameters: + rope_parameters["rope_type"] = rope_parameters["type"] logger.info("Replacing legacy 'type' key with 'rope_type'") - if "rope_type" not in rope_scaling: - raise ValueError("rope_scaling should have a 'rope_type' key") + if "rope_type" not in rope_parameters: + raise ValueError("rope_parameters should have a 'rope_type' key") - if rope_scaling["rope_type"] == "su": - rope_scaling["rope_type"] = "longrope" + if rope_parameters["rope_type"] == "su": + rope_parameters["rope_type"] = "longrope" logger.warning("Replacing legacy rope_type 'su' with 'longrope'") - elif rope_scaling["rope_type"] == "mrope": - assert "mrope_section" in rope_scaling - rope_scaling["rope_type"] = "default" + elif rope_parameters["rope_type"] == "mrope": + assert "mrope_section" in rope_parameters + rope_parameters["rope_type"] = "default" logger.warning("Replacing legacy rope_type 'mrope' with 'default'") def _uses_mrope(config: PretrainedConfig) -> bool: - rope_scaling = getattr(config, "rope_scaling", None) - if rope_scaling is None: + rope_parameters = getattr(config, "rope_parameters", None) + if rope_parameters is None: return False - return "mrope_section" in rope_scaling + return "mrope_section" in rope_parameters def uses_mrope(config: PretrainedConfig) -> bool: @@ -690,7 +733,14 @@ def get_config( logger.debug("Overriding HF config with %s", hf_overrides_fn) config = hf_overrides_fn(config) - patch_rope_scaling(config) + # Exhaustively patch RoPE parameters everywhere they might be + patch_rope_parameters(config) + patch_rope_parameters(config.get_text_config()) + SubConfigs: TypeAlias = dict[str, PretrainedConfig] + sub_configs: SubConfigs | None = getattr(config, "sub_configs", None) + if sub_configs: + for sub_config in sub_configs: + patch_rope_parameters(getattr(config, sub_config)) if 
trust_remote_code: maybe_register_config_serialize_by_value() diff --git a/vllm/transformers_utils/configs/afmoe.py b/vllm/transformers_utils/configs/afmoe.py index 9b634fd037a33..47fee9882f9fc 100644 --- a/vllm/transformers_utils/configs/afmoe.py +++ b/vllm/transformers_utils/configs/afmoe.py @@ -24,7 +24,7 @@ class AfmoeConfig(PretrainedConfig): rms_norm_eps: float = 1e-5, use_cache: bool = True, tie_word_embeddings: bool = False, - rope_theta: float = 10000.0, + rope_parameters: dict | None = None, rope_scaling: dict | None = None, num_experts: int = 64, num_experts_per_tok: int = 6, @@ -56,7 +56,10 @@ class AfmoeConfig(PretrainedConfig): self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta + rope_theta = kwargs.pop("rope_theta", 10000.0) + if rope_parameters is None: + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + self.rope_parameters = rope_parameters self.rope_scaling = rope_scaling self.moe_intermediate_size = moe_intermediate_size diff --git a/vllm/transformers_utils/configs/arctic.py b/vllm/transformers_utils/configs/arctic.py index 1707e15285c89..ba4b1a8f701f0 100644 --- a/vllm/transformers_utils/configs/arctic.py +++ b/vllm/transformers_utils/configs/arctic.py @@ -85,8 +85,15 @@ class ArcticConfig(PretrainedConfig): The id of the "end-of-sequence" token. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. + rope_parameters (`dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_theta` (`float`): The base period of the RoPE embeddings. 
+ `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. sliding_window (`int`, *optional*): Sliding window attention window size. If not specified, will default to `4096`. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -132,7 +139,7 @@ class ArcticConfig(PretrainedConfig): bos_token_id=1, eos_token_id=2, tie_word_embeddings=False, - rope_theta=1e6, + rope_parameters: dict[str, Any] | None = None, sliding_window=None, attention_dropout=0.0, num_experts_per_tok=1, @@ -165,7 +172,10 @@ class ArcticConfig(PretrainedConfig): self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta + rope_theta = kwargs.pop("rope_theta", 1e6) + if rope_parameters is None: + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + self.rope_parameters = rope_parameters self.attention_dropout = attention_dropout self.num_experts_per_tok = num_experts_per_tok diff --git a/vllm/transformers_utils/configs/flex_olmo.py b/vllm/transformers_utils/configs/flex_olmo.py index 1f2f4d446288b..c343dc0999a87 100644 --- a/vllm/transformers_utils/configs/flex_olmo.py +++ b/vllm/transformers_utils/configs/flex_olmo.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any from transformers.configuration_utils import PretrainedConfig @@ -25,8 +26,7 @@ class FlexOlmoConfig(PretrainedConfig): bos_token_id=None, eos_token_id=100257, tie_word_embeddings=False, - rope_theta=500000.0, - rope_scaling=None, + rope_parameters: dict[str, Any] | None = None, attention_bias=False, attention_dropout=0.0, num_experts_per_tok=5, @@ -62,8 +62,13 @@ class FlexOlmoConfig(PretrainedConfig): self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - 
self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.num_experts_per_tok = num_experts_per_tok @@ -73,5 +78,5 @@ class FlexOlmoConfig(PretrainedConfig): self.norm_topk_prob = norm_topk_prob # Validate the correctness of rotary position embeddings parameters # BC: if there is a 'type' field, move it to 'rope_type'. - if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + if self.rope_parameters is not None and "type" in self.rope_parameters: + self.rope_parameters["rope_type"] = self.rope_parameters["type"] diff --git a/vllm/transformers_utils/configs/kimi_linear.py b/vllm/transformers_utils/configs/kimi_linear.py index 65ddf48c5249b..14894816801d1 100644 --- a/vllm/transformers_utils/configs/kimi_linear.py +++ b/vllm/transformers_utils/configs/kimi_linear.py @@ -29,8 +29,7 @@ class KimiLinearConfig(PretrainedConfig): pad_token_id=0, bos_token_id=1, eos_token_id=2, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, tie_word_embeddings=False, moe_intermediate_size: int | None = None, moe_renormalize: bool = True, @@ -73,8 +72,13 @@ class KimiLinearConfig(PretrainedConfig): self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or 
rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.q_lora_rank = q_lora_rank self.kv_lora_rank = kv_lora_rank diff --git a/vllm/transformers_utils/configs/lfm2_moe.py b/vllm/transformers_utils/configs/lfm2_moe.py index 37c038e12db80..b399a03c030f0 100644 --- a/vllm/transformers_utils/configs/lfm2_moe.py +++ b/vllm/transformers_utils/configs/lfm2_moe.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any from transformers.configuration_utils import PretrainedConfig @@ -35,8 +36,8 @@ class Lfm2MoeConfig(PretrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. + rope_parameters (`dict`, *optional*): + The parameters of the RoPE embeddings. max_position_embeddings (`int`, *optional*, defaults to 128000): The maximum sequence length that this model might ever be used with. 
use_cache (`bool`, *optional*, defaults to `True`): @@ -100,7 +101,7 @@ class Lfm2MoeConfig(PretrainedConfig): bos_token_id: int = 1, eos_token_id: int = 2, tie_word_embeddings: bool = True, - rope_theta: float = 1000000.0, + rope_parameters: dict[str, Any] | None = None, max_position_embeddings: int = 128_000, use_cache: bool = True, norm_eps: float = 0.00001, @@ -121,7 +122,10 @@ class Lfm2MoeConfig(PretrainedConfig): self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers - self.rope_theta = rope_theta + rope_theta = kwargs.pop("rope_theta", 1000000.0) + if rope_parameters is None: + rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} + self.rope_parameters = rope_parameters self.max_position_embeddings = max_position_embeddings self.use_cache = use_cache self.norm_eps = norm_eps diff --git a/vllm/transformers_utils/configs/midashenglm.py b/vllm/transformers_utils/configs/midashenglm.py index e49bd26b2b00c..f1bbd057103e4 100644 --- a/vllm/transformers_utils/configs/midashenglm.py +++ b/vllm/transformers_utils/configs/midashenglm.py @@ -98,6 +98,6 @@ class MiDashengLMConfig(PretrainedConfig): if text_config else Qwen2_5OmniTextConfig() ) - self.text_config.rope_scaling = None # uses_mrope is false + self.text_config.rope_parameters = None # uses_mrope is false self.audio_token_id = audio_token_id super().__init__(**kwargs) diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py index c6f04febe37e1..8f72f0b28b0de 100644 --- a/vllm/transformers_utils/configs/mistral.py +++ b/vllm/transformers_utils/configs/mistral.py @@ -86,13 +86,13 @@ def _remap_mistral_yarn_args(config: dict) -> dict: "apply_scale": "apply_yarn_scaling", } yarn_config = config.get("yarn") or {} - config["rope_scaling"] = { + config["rope_parameters"] = { "rope_type": "yarn", "mscale_all_dim": 1, } for old_name, new_name in yarn_config_map.items(): if old_name in yarn_config: 
- config["rope_scaling"][new_name] = yarn_config.pop(old_name) + config["rope_parameters"][new_name] = yarn_config.pop(old_name) assert len(yarn_config) == 0, f"Unparsed yarn config: {yarn_config}" diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 60eed549561fb..d112c71d7d20b 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -88,8 +88,8 @@ class NemotronConfig(PretrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`dict`, *optional*): + The parameters of the RoPE embeddings. partial_rotary_factor (`float`, *optional*, defaults to 0.5): Percentage of the query and keys which will have rotary embedding. attention_bias (`bool`, *optional*, defaults to `False`): @@ -132,8 +132,7 @@ class NemotronConfig(PretrainedConfig): bos_token_id=2, eos_token_id=3, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, partial_rotary_factor=0.5, attention_bias=False, attention_dropout=0.0, @@ -160,8 +159,13 @@ class NemotronConfig(PretrainedConfig): self.initializer_range = initializer_range self.norm_eps = norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters # for backward compatibility partial_rotary_factor = ( kwargs.get("rope_percent") @@ -169,7 +173,7 @@ class NemotronConfig(PretrainedConfig): or 
partial_rotary_factor ) self.partial_rotary_factor = partial_rotary_factor - self._rope_scaling_validation() + self._rope_parameters_validation() self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias @@ -182,31 +186,29 @@ class NemotronConfig(PretrainedConfig): **kwargs, ) - def _rope_scaling_validation(self): + def _rope_parameters_validation(self): """ - Validate the `rope_scaling` configuration. + Validate the `rope_parameters` configuration. """ - if self.rope_scaling is None: + if self.rope_parameters is None: return - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + rope_type: str | None = self.rope_parameters.get("rope_type", None) + factor: float | None = self.rope_parameters.get("factor", None) + + if rope_type not in {"default", "linear", "dynamic"}: raise ValueError( - "`rope_scaling` must be a dictionary with two fields, " - f"`type` and `factor`, got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - "`rope_scaling`'s type field must be one of ['linear', " - f"'dynamic'], got {rope_scaling_type}" - ) - if ( - rope_scaling_factor is None - or not isinstance(rope_scaling_factor, float) - or rope_scaling_factor <= 1.0 - ): - raise ValueError( - "`rope_scaling`'s factor field must be a float > 1, got " - f"{rope_scaling_factor}" + "`rope_type` must be one of ['default', 'linear', 'dynamic'], " + f"got {rope_type}" ) + if rope_type != "default": + if factor is None: + raise ValueError( + "If `rope_type` is not 'default', `rope_parameters` " + "must include a `factor` field. Got `None`." 
+ ) + if not isinstance(factor, float) or factor <= 1.0: + raise ValueError( + "`rope_parameters`'s factor field must be a float > 1, got " + f"{factor}" + ) diff --git a/vllm/transformers_utils/configs/olmo3.py b/vllm/transformers_utils/configs/olmo3.py index f5a9a7cd36bdb..c4691b661af39 100644 --- a/vllm/transformers_utils/configs/olmo3.py +++ b/vllm/transformers_utils/configs/olmo3.py @@ -24,8 +24,7 @@ class Olmo3Config(PretrainedConfig): bos_token_id=None, eos_token_id=50279, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, attention_bias=False, attention_dropout=0.0, rms_norm_eps=1e-5, @@ -63,8 +62,13 @@ class Olmo3Config(PretrainedConfig): self.hidden_act = hidden_act self.initializer_range = initializer_range self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.attention_bias = attention_bias self.attention_dropout = attention_dropout diff --git a/vllm/transformers_utils/configs/qwen3_next.py b/vllm/transformers_utils/configs/qwen3_next.py index 21750bde2f878..d2fe58d48da6f 100644 --- a/vllm/transformers_utils/configs/qwen3_next.py +++ b/vllm/transformers_utils/configs/qwen3_next.py @@ -66,13 +66,12 @@ class Qwen3NextConfig(PretrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. 
- rope_scaling (`Dict`, *optional*): + rope_parameters (`dict`, *optional*): Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly. Expected contents: + `rope_theta` (`float`): The base period of the RoPE embeddings. `rope_type` (`str`): The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation. @@ -199,8 +198,7 @@ class Qwen3NextConfig(PretrainedConfig): rms_norm_eps=1e-6, use_cache=True, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, partial_rotary_factor=0.25, attention_bias=False, attention_dropout=0.0, @@ -236,8 +234,13 @@ class Qwen3NextConfig(PretrainedConfig): self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.partial_rotary_factor = partial_rotary_factor self.attention_bias = attention_bias self.attention_dropout = attention_dropout diff --git a/vllm/transformers_utils/configs/step3_vl.py b/vllm/transformers_utils/configs/step3_vl.py index 637b82d88e265..0ee650a70451f 100644 --- a/vllm/transformers_utils/configs/step3_vl.py +++ b/vllm/transformers_utils/configs/step3_vl.py @@ -52,8 +52,7 @@ class Step3TextConfig(PretrainedConfig): moe_intermediate_size: int = 5120, moe_num_experts: int = 48, moe_top_k: int = 3, - rope_theta: float = 500000, 
- rope_scaling: dict[str, Any] | None = None, + rope_parameters: dict[str, Any] | None = None, max_position_embedding: int = 65536, share_expert_dim: int = 5120, share_q_dim: int = 2048, @@ -130,8 +129,13 @@ class Step3TextConfig(PretrainedConfig): self.moe_intermediate_size = moe_intermediate_size self.moe_num_experts = moe_num_experts self.moe_top_k = moe_top_k - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"} + rope_theta = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in rope_parameters: + rope_parameters["rope_theta"] = rope_theta + self.rope_parameters = rope_parameters self.max_position_embedding = max_position_embedding self.share_expert_dim = share_expert_dim self.share_q_dim = share_q_dim From 0c80efd94fb8c17cfc7d1bcb9cdb65f154340994 Mon Sep 17 00:00:00 2001 From: Yuxuan Zhang <2448370773@qq.com> Date: Thu, 20 Nov 2025 01:32:55 +0800 Subject: [PATCH 024/249] GLM-V video segmentation solution adjustment (#28941) Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> --- vllm/model_executor/models/glm4_1v.py | 94 +++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 6581bbda6d609..d141e95498064 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -37,7 +37,7 @@ import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange -from transformers import BatchFeature +from transformers import BatchFeature, Glm4vProcessor from transformers.models.glm4v.configuration_glm4v import Glm4vVisionConfig from transformers.models.glm4v.image_processing_glm4v import ( Glm4vImageProcessor, @@ -1028,7 +1028,7 @@ class Glm4vProcessingInfo(BaseProcessingInfo): 
return max(max_frames_per_video, 1) - def _get_video_second_idx( + def _get_video_second_idx_glm4v( self, metadata: dict[str, Any], total_frames: int ) -> list[int]: video_processor = self.get_video_processor() @@ -1079,6 +1079,83 @@ class Glm4vProcessingInfo(BaseProcessingInfo): selected_timestamps.append(timestamps_list[idx]) return selected_timestamps + def _get_video_second_idx_glm46v( + self, metadata: dict[str, Any], total_frames: int + ) -> list[int]: + video_processor = self.get_video_processor() + + video_fps = metadata["fps"] + meta_frames = metadata.get("total_num_frames", total_frames) + max_frame_idx = meta_frames - 1 + duration = metadata.get("duration", round(max_frame_idx / video_fps) + 1) + + do_sample_frames = metadata.get("do_sample_frames", True) + if not do_sample_frames: + frame_indices = metadata["frames_indices"] + else: + DYNAMIC_FPS_THRES = {30: 3, 300: 1, 2400: 0.5} + MAX_FRAME_COUNT_DYNAMIC = 640 + MAX_DURATION = 2400 + + effective_duration = min(duration, MAX_DURATION) + if effective_duration <= 30: + target_fps = DYNAMIC_FPS_THRES[30] + elif effective_duration <= 300: + target_fps = DYNAMIC_FPS_THRES[300] + else: + target_fps = DYNAMIC_FPS_THRES[2400] + + temporal_patch_size = getattr(video_processor, "temporal_patch_size", 1) + extract_t = int(effective_duration * target_fps * temporal_patch_size) + extract_t = min(extract_t, MAX_FRAME_COUNT_DYNAMIC) + + duration_per_frame = 1 / video_fps + timestamps = [i * duration_per_frame for i in range(meta_frames)] + max_second = int(duration) + + if meta_frames < extract_t: + frame_indices = np.linspace( + 0, meta_frames - 1, extract_t, dtype=int + ).tolist() + else: + frame_indices = [] + current_second = 0.0 + inv_fps = 1 / (temporal_patch_size * target_fps) + for frame_index in range(meta_frames): + if timestamps[frame_index] >= current_second: + current_second += inv_fps + frame_indices.append(frame_index) + if current_second >= max_second: + break + + if len(frame_indices) < extract_t: + 
if len(frame_indices) == 0: + start, end = 0, max(meta_frames - 1, 0) + else: + start, end = frame_indices[0], frame_indices[-1] + frame_indices = np.linspace(start, end, extract_t, dtype=int).tolist() + elif len(frame_indices) > extract_t: + frame_indices = np.linspace( + 0, meta_frames - 1, extract_t, dtype=int + ).tolist() + + seen, uniq = set(), [] + for idx in frame_indices: + if idx not in seen: + seen.add(idx) + uniq.append(idx) + + if len(uniq) & 1: + uniq.append(uniq[-1]) + + frame_indices = uniq + full_second_idxs = [int(idx / video_fps) for idx in frame_indices] + timestamps_list = full_second_idxs[::2] + selected_timestamps = [] + for idx in range(len(timestamps_list)): + selected_timestamps.append(timestamps_list[idx]) + return selected_timestamps + def _construct_video_placeholder( self, video_array: np.ndarray, @@ -1097,9 +1174,18 @@ class Glm4vProcessingInfo(BaseProcessingInfo): merge_length = image_processor.merge_size**2 assert isinstance(grid_thw, torch.Tensor) - timestamps = self._get_video_second_idx(metadata, len(video_array)) + timestamps = ( + self._get_video_second_idx_glm4v(metadata, len(video_array)) + if isinstance(hf_processor, Glm4vProcessor) + else self._get_video_second_idx_glm46v(metadata, len(video_array)) + ) + + timestamp_format = ( + "{}" if isinstance(hf_processor, Glm4vProcessor) else "{:.1f} seconds" + ) frames_idx_token = [ - tokenizer.encode(str(i), add_special_tokens=False) for i in timestamps + tokenizer.encode(timestamp_format.format(i), add_special_tokens=False) + for i in timestamps ] T, H, W = grid_thw num_tokens_per_frame = int(H * W) // merge_length From 61728cd1dfb03cbbfa03924f2a2cda311cfc13ac Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Wed, 19 Nov 2025 13:32:19 -0500 Subject: [PATCH 025/249] Re-enable FlashInfer for Llama4 on Blackwell in e2e fusion tests (#28966) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 
Luka Govedič Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com> Co-authored-by: Luka Govedič --- .buildkite/test-pipeline.yaml | 2 ++ tests/compile/distributed/test_fusions_e2e.py | 12 ++++-------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d4b6f4077ab32..98daebcc06931 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -930,6 +930,8 @@ steps: - csrc/quantization/fp4/ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/worker/ + - vllm/v1/cudagraph_dispatcher.py - vllm/compilation/ # can affect pattern matching - vllm/model_executor/layers/layernorm.py diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py index 2e1b595a43895..661172e1965b5 100644 --- a/tests/compile/distributed/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -47,12 +47,8 @@ if current_platform.is_cuda(): ModelBackendTestCase( # Use smaller model for L40s in CI model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", - # TODO while llama4 is broken, use FLASHINFER for llama3 on Blackwell - # so FI attention+fp8_quant is at least tested once model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), - backend=AttentionBackendEnum.FLASHINFER - if is_blackwell() - else AttentionBackendEnum.TRITON_ATTN, + backend=AttentionBackendEnum.TRITON_ATTN, matches=Matches( attention_fusion=32, allreduce_fusion=65, @@ -65,9 +61,9 @@ if current_platform.is_cuda(): model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), # TODO FlashInfer attn broken on Hopper with kvcache=fp8: # https://github.com/vllm-project/vllm/issues/28568 - # TODO FlashInfer attn broken on Blackwell for llama4: - # https://github.com/vllm-project/vllm/issues/28604 - 
backend=AttentionBackendEnum.TRITON_ATTN, + backend=AttentionBackendEnum.FLASHINFER + if is_blackwell() + else AttentionBackendEnum.TRITON_ATTN, matches=Matches( attention_fusion=48, allreduce_fusion=96, From 3319a493fcc3e4733382f0dc812184234e9c3dcb Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Wed, 19 Nov 2025 11:20:22 -0800 Subject: [PATCH 026/249] [Core] Reuse created spec tokens lists to mitigate GC cost (#28917) Signed-off-by: Jialin Ouyang --- vllm/v1/worker/gpu_input_batch.py | 18 ++++++++++++------ vllm/v1/worker/gpu_model_runner.py | 3 ++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 023b5edb2c340..c1bfe727d86e5 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -251,7 +251,7 @@ class InputBatch: self.logitsprocs_need_output_token_ids = logitsprocs_need_output_token_ids # Store last speculative tokens for sampler. - self.spec_token_ids: list[list[int] | None] = [] + self.spec_token_ids: list[list[int]] = [[] for _ in range(max_num_reqs)] # This is updated each time the batch constituents change. 
self.sampling_metadata = self._make_sampling_metadata() @@ -313,7 +313,7 @@ class InputBatch: else: self._req_ids[req_index] = req_id self.req_output_token_ids[req_index] = request.output_token_ids - self.spec_token_ids[req_index] = [] + self.spec_token_ids[req_index].clear() self.req_id_to_index[req_id] = req_index @@ -462,7 +462,7 @@ class InputBatch: self.batch_update_builder.removed_append(req_index) self._req_ids[req_index] = None self.req_output_token_ids[req_index] = None - self.spec_token_ids[req_index] = None + self.spec_token_ids[req_index].clear() # LoRA lora_id = self.request_lora_mapping[req_index] @@ -654,9 +654,15 @@ class InputBatch: self.req_output_token_ids[last_req_index] = None self.req_id_to_index[req_id] = empty_index - spec_token_ids = self.spec_token_ids[last_req_index] - self.spec_token_ids[empty_index] = spec_token_ids - self.spec_token_ids[last_req_index] = None + if last_req_index != empty_index: + ( + self.spec_token_ids[last_req_index], + self.spec_token_ids[empty_index], + ) = ( + self.spec_token_ids[empty_index], + self.spec_token_ids[last_req_index], + ) + self.spec_token_ids[last_req_index].clear() num_tokens = self.num_tokens[last_req_index] self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[ diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3b00085b6bb99..0c35f1330e9f0 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -892,7 +892,8 @@ class GPUModelRunner( # conform to the schema. This can result in # scheduler_output.scheduled_spec_decode_tokens being empty, # even when speculative decoding is enabled. 
- self.input_batch.spec_token_ids[req_index] = spec_token_ids + self.input_batch.spec_token_ids[req_index].clear() + self.input_batch.spec_token_ids[req_index].extend(spec_token_ids) # there are no draft tokens with async scheduling, # we clear the spec_decoding info in scheduler_output and From fe69f331f84d99541564dfe4852dd45220ed7875 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Wed, 19 Nov 2025 14:23:54 -0500 Subject: [PATCH 027/249] [Kernels] Improve H200 Fused MoE Config (#28992) Signed-off-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> --- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 122 +++++++++--------- 1 file changed, 61 insertions(+), 61 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json index 6fcf408755f5d..532c16e899269 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json @@ -1,11 +1,11 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, + "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 5 + "num_stages": 4 }, "2": { "BLOCK_SIZE_M": 16, @@ -13,82 +13,82 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "4": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 4 }, "8": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 
4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, - "16": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 3 - }, - "24": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4, - "num_stages": 3 - }, - "32": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 3 - }, - "48": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 3 - }, "64": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + "GROUP_SIZE_M": 64, "num_warps": 4, "num_stages": 3 }, "96": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 3 - }, - "128": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, "256": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, @@ -96,10 +96,10 @@ "num_stages": 3 }, "512": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 32, + 
"GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 }, @@ -109,7 +109,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "1536": { "BLOCK_SIZE_M": 64, @@ -117,21 +117,21 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "2048": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 32, "num_warps": 4, - "num_stages": 3 + "num_stages": 4 }, "3072": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 16, + "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4 }, @@ -139,7 +139,7 @@ "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 64, + "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 3 } From 9d2d5612573c20f8bf00242a8525c2a5dcfe4c06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=B0=E5=85=AE?= <38908462+zhyajie@users.noreply.github.com> Date: Thu, 20 Nov 2025 03:30:57 +0800 Subject: [PATCH 028/249] [Bugfix] Fix precision corruption when shared_experts_stream=None (#28942) Signed-off-by: zhyajie Co-authored-by: zhyajie --- vllm/model_executor/layers/fused_moe/layer.py | 11 +++++++---- vllm/utils/torch_utils.py | 3 +-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index c41995e4a9136..8e9bba3442873 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -371,8 +371,8 @@ class FusedMoE(CustomOp): logger.info_once("Disabling MoE shared_experts cuda stream") self.shared_experts_stream = None else: - # TODO(rob): enable shared expert overlap with non-cuda. - # aux_stream() returns None on non-cuda platforms. + # TODO(rob): enable shared expert overlap with non-cuda-alike. + # aux_stream() returns None on non-cuda-alike platforms. 
self.shared_experts_stream = aux_stream() if self.shared_experts_stream is not None: logger.info_once("Enabled separate cuda stream for MoE shared_experts") @@ -1865,6 +1865,11 @@ class FusedMoE(CustomOp): hidden_states_combined, router_logits = get_ep_group().dispatch( hidden_states, router_logits, self.is_sequence_parallel ) + # Run shared experts before matrix multiply. + # because matrix multiply maybe modify the hidden_states. + if has_separate_shared_experts and not use_shared_experts_stream: + assert self.shared_experts is not None + shared_output = self.shared_experts(hidden_states) # Matrix multiply. final_hidden_states = self.quant_method.apply( @@ -1908,8 +1913,6 @@ class FusedMoE(CustomOp): # conflict with the main stream shared_output = self.shared_experts(hidden_states_clone) current_stream().wait_stream(self.shared_experts_stream) - else: - shared_output = self.shared_experts(hidden_states) final_hidden_states = ( shared_output, diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py index 7c094e14cff72..3661dfd09047a 100644 --- a/vllm/utils/torch_utils.py +++ b/vllm/utils/torch_utils.py @@ -426,8 +426,7 @@ def aux_stream() -> torch.cuda.Stream | None: from vllm.platforms import current_platform - # TODO: validate this works properly on ROCm platform. 
- if _aux_stream is None and current_platform.is_cuda(): + if _aux_stream is None and current_platform.is_cuda_alike(): _aux_stream = torch.cuda.Stream() return _aux_stream From ac10fd3c6900228e3c0a8fae20d039668c132446 Mon Sep 17 00:00:00 2001 From: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com> Date: Wed, 19 Nov 2025 11:59:30 -0800 Subject: [PATCH 029/249] Upstreaming aiter triton attention backend as a new backend (#28701) Signed-off-by: Aleksandr Malyshev Co-authored-by: Aleksandr Malyshev --- vllm/attention/backends/registry.py | 3 + vllm/platforms/rocm.py | 4 +- .../backends/mla/aiter_triton_mla.py | 74 +++++++++++++++++++ 3 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 vllm/v1/attention/backends/mla/aiter_triton_mla.py diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py index 51899b0235915..91e1cad01f4fd 100644 --- a/vllm/attention/backends/registry.py +++ b/vllm/attention/backends/registry.py @@ -46,6 +46,9 @@ class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta): XFORMERS = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend" ROCM_ATTN = "vllm.v1.attention.backends.rocm_attn.RocmAttentionBackend" ROCM_AITER_MLA = "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend" + ROCM_AITER_TRITON_MLA = ( + "vllm.v1.attention.backends.mla.aiter_triton_mla.AiterTritonMLABackend" + ) ROCM_AITER_FA = ( "vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend" ) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index bb116792fed54..f07f068a9249b 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -234,7 +234,6 @@ class RocmPlatform(Platform): if rocm_aiter_ops.is_mla_enabled() or block_size == 1 else AttentionBackendEnum.TRITON_MLA ) - if selected_backend == AttentionBackendEnum.TRITON_MLA: if block_size != 1: logger.info_once("Using Triton MLA backend.") @@ -246,6 +245,9 @@ class RocmPlatform(Platform): if selected_backend 
== AttentionBackendEnum.ROCM_AITER_MLA: logger.info("Using AITER MLA backend.") return AttentionBackendEnum.ROCM_AITER_MLA.get_path() + if selected_backend == AttentionBackendEnum.ROCM_AITER_TRITON_MLA: + logger.info("Using AITER TRITON MLA backend.") + return AttentionBackendEnum.ROCM_AITER_TRITON_MLA.get_path() raise ValueError( f" The selected backend, {selected_backend.name}," diff --git a/vllm/v1/attention/backends/mla/aiter_triton_mla.py b/vllm/v1/attention/backends/mla/aiter_triton_mla.py new file mode 100644 index 0000000000000..8a92152a0ca53 --- /dev/null +++ b/vllm/v1/attention/backends/mla/aiter_triton_mla.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.v1.attention.backends.mla.common import MLACommonBackend +from vllm.v1.attention.backends.mla.rocm_aiter_mla import ( + AiterMLAImpl, + AiterMLAMetadataBuilder, +) + + +class AiterTritonMLABackend(MLACommonBackend): + @staticmethod + def get_name() -> str: + return "AITER_TRITON_MLA" + + @staticmethod + def get_impl_cls() -> type["AiterTritonMLAImpl"]: + return AiterTritonMLAImpl + + @staticmethod + def get_builder_cls() -> type["AiterMLAMetadataBuilder"]: + return AiterMLAMetadataBuilder + + +class AiterTritonMLAImpl(AiterMLAImpl): + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: list[float] | None, + sliding_window: int | None, + kv_cache_dtype: str, + logits_soft_cap: float | None, + attn_type: str, + kv_sharing_target_layer_name: str | None, + # MLA Specific Arguments + **mla_args, + ) -> None: + super().__init__( + num_heads, + head_size, + scale, + num_kv_heads, + alibi_slopes, + sliding_window, + kv_cache_dtype, + logits_soft_cap, + attn_type, + kv_sharing_target_layer_name, + **mla_args, + ) + from aiter.ops.triton.mha import flash_attn_varlen_func + + self.flash_attn_varlen_func = flash_attn_varlen_func + + def 
_flash_attn_varlen_diff_headdims( + self, q, k, v, return_softmax_lse=False, softmax_scale=None, **kwargs + ): + result = self.flash_attn_varlen_func( + q, + k, + v, + softmax_scale=softmax_scale, + return_lse=return_softmax_lse, + **kwargs, + ) + # Transpose the LSE if Triton MHA is used: + # (q.shape[0], num_q_heads) to (num_q_heads, q.shape[0]) + if type(result) is tuple and return_softmax_lse: + output, lse = result + lse = lse.T.contiguous() + return (output, lse) + return result From 02f5903b84cfdf0b7cb31d46e995e3d4b9ad9e53 Mon Sep 17 00:00:00 2001 From: Izzy Putterman Date: Wed, 19 Nov 2025 12:01:05 -0800 Subject: [PATCH 030/249] Eagle: MM Cuda Graphs with MRope (#28896) Signed-off-by: Izzy Putterman Co-authored-by: Cyrus Leung --- vllm/model_executor/models/llama_eagle3.py | 14 ++++++-------- vllm/v1/spec_decode/eagle.py | 13 +++++++++++-- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 75c671311b491..3eaf2d80082f1 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -23,7 +23,6 @@ from vllm.model_executor.model_loader.weight_utils import ( maybe_remap_kv_scale_name, ) from vllm.model_executor.models.llama import LlamaDecoderLayer, LlamaForCausalLM -from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import NestedTensors from .utils import ( @@ -121,13 +120,12 @@ class LlamaDecoderLayer(LlamaDecoderLayer): @support_torch_compile( - # torch.compile is disabled for multimodal EAGLE3 models due to constraint - # violations with dynamic shapes during tensor concatenation operations. - # See: https://github.com/vllm-project/vllm/pull/22872/files#r2362028132 - # Non-multimodal EAGLE3 models can still use torch.compile safely. 
- enable_if=lambda vllm_config: not MULTIMODAL_REGISTRY.supports_multimodal_inputs( - vllm_config.model_config - ), + dynamic_arg_dims={ + "input_ids": 0, + "positions": -1, + "hidden_states": 0, + "input_embeds": 0, + } ) class LlamaModel(nn.Module): def __init__( diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 5bf2503c3027d..406bb696bd4cf 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -116,9 +116,18 @@ class EagleProposer: ) self.uses_mrope = self.vllm_config.model_config.uses_mrope if self.uses_mrope: - # M-RoPE need (3, max_num_tokens) + # NOTE: `mrope_positions` is implemented with one additional dummy + # position on purpose to make it non-contiguous so that it can work + # with torch compile. + # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1926431923 + + # NOTE: When M-RoPE is enabled, position ids are 3D regardless of + # the modality of inputs. For text-only inputs, each dimension has + # identical position IDs, making M-RoPE functionally equivalent to + # 1D-RoPE. 
+ # See page 5 of https://arxiv.org/abs/2409.12191 self.mrope_positions = torch.zeros( - (3, self.max_num_tokens), dtype=torch.int64, device=device + (3, self.max_num_tokens + 1), dtype=torch.int64, device=device ) else: # RoPE need (max_num_tokens,) From 2fd893b4cec0975a2a8430077fd9b4f294eb3561 Mon Sep 17 00:00:00 2001 From: Qiu Date: Thu, 20 Nov 2025 04:52:44 +0800 Subject: [PATCH 031/249] [Feature] Prefill Context Parallel (PCP) basic support (#28718) Signed-off-by: QiuChunshuo Signed-off-by: FENP Signed-off-by: LookAround Signed-off-by: Jingchun Gao Signed-off-by: zhenwenqi2024 Co-authored-by: FENP Co-authored-by: LookAround Co-authored-by: Jingchun Gao Co-authored-by: zhenwenqi2024 Co-authored-by: Jingchun Gao <63247409+gjc0824@users.noreply.github.com> --- tests/distributed/test_context_parallel.py | 12 +-- .../moe/modular_kernel_tools/common.py | 7 +- tests/v1/worker/test_gpu_model_runner.py | 4 +- vllm/attention/backends/abstract.py | 17 +++++ vllm/attention/ops/common.py | 40 +++++++++- vllm/config/parallel.py | 40 +++++++--- vllm/config/vllm.py | 32 ++++++-- vllm/distributed/parallel_state.py | 74 +++++++++++++++---- vllm/engine/arg_utils.py | 22 ++++++ .../model_executor/layers/fused_moe/config.py | 59 ++++++++++----- vllm/model_executor/layers/fused_moe/layer.py | 32 ++++++++ vllm/model_executor/models/gpt_oss.py | 9 ++- vllm/v1/attention/backends/flash_attn.py | 6 +- vllm/v1/attention/backends/mla/common.py | 6 +- vllm/v1/attention/backends/utils.py | 18 ++--- vllm/v1/core/kv_cache_coordinator.py | 17 +++++ vllm/v1/core/kv_cache_manager.py | 9 +-- vllm/v1/core/kv_cache_utils.py | 13 +++- vllm/v1/core/sched/scheduler.py | 2 + vllm/v1/core/single_type_kv_cache_manager.py | 19 ++++- vllm/v1/engine/core.py | 1 + vllm/v1/executor/multiproc_executor.py | 23 ++++-- vllm/v1/kv_cache_interface.py | 5 +- vllm/v1/worker/block_table.py | 35 +++++---- vllm/v1/worker/gpu_input_batch.py | 4 +- vllm/v1/worker/gpu_model_runner.py | 4 +- vllm/v1/worker/gpu_worker.py | 3 
+ 27 files changed, 399 insertions(+), 114 deletions(-) diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py index b16fd0d06b145..7e4713b8aece0 100644 --- a/tests/distributed/test_context_parallel.py +++ b/tests/distributed/test_context_parallel.py @@ -31,7 +31,7 @@ class ParallelSetup(NamedTuple): tp_size: int pp_size: int dcp_size: int - dcp_kv_cache_interleave_size: int + cp_kv_cache_interleave_size: int eager_mode: bool chunked_prefill: bool @@ -55,7 +55,7 @@ class CPTestSettings: tp_base: int = 4, pp_base: int = 1, dcp_base: int = 1, - dcp_kv_cache_interleave_size: int = 1, + cp_kv_cache_interleave_size: int = 1, multi_node_only: bool = False, runner: RunnerOption = "auto", load_format: str | None = None, @@ -71,7 +71,7 @@ class CPTestSettings: tp_size=tp_base, pp_size=pp_multiplier * pp_base, dcp_size=int(dcp_multiplier * tp_base), - dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size=cp_kv_cache_interleave_size, eager_mode=eager_mode_val, chunked_prefill=chunked_prefill_val, ) @@ -116,7 +116,7 @@ def _compare_cp_with_tp( tp_size, pp_size, dcp_size, - dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size, eager_mode, chunked_prefill, ) = parallel_setup @@ -197,7 +197,7 @@ def _compare_cp_with_tp( "--decode-context-parallel-size", str(dcp_size), "--dcp-kv-cache-interleave-size", - str(dcp_kv_cache_interleave_size), + str(cp_kv_cache_interleave_size), "--distributed-executor-backend", distributed_backend, ] @@ -227,7 +227,7 @@ CP_TEXT_GENERATION_MODELS = { "deepseek-ai/DeepSeek-V2-Lite-Chat": [ CPTestSettings.detailed(), CPTestSettings.detailed(tp_base=2), - CPTestSettings.detailed(tp_base=2, dcp_kv_cache_interleave_size=64), + CPTestSettings.detailed(tp_base=2, cp_kv_cache_interleave_size=64), ], "bigcode/gpt_bigcode-santacoder": [ CPTestSettings.detailed(), diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py 
index 1d925dc1bea8f..d95c22fdf0a5b 100644 --- a/tests/kernels/moe/modular_kernel_tools/common.py +++ b/tests/kernels/moe/modular_kernel_tools/common.py @@ -15,7 +15,11 @@ from tests.kernels.quantization.nvfp4_utils import ( ) from tests.kernels.utils import torch_experts from vllm.config import VllmConfig -from vllm.distributed import get_dp_group, get_tensor_model_parallel_world_size +from vllm.distributed import ( + get_dp_group, + get_pcp_group, + get_tensor_model_parallel_world_size, +) from vllm.forward_context import set_forward_context from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, @@ -561,6 +565,7 @@ def make_modular_kernel( # make moe config moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make( tp_size_=get_tensor_model_parallel_world_size(), + pcp_size_=get_pcp_group().world_size, dp_size_=get_dp_group().world_size, vllm_parallel_config=vllm_config.parallel_config, ) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index b95c8df3469b3..824e458978350 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -956,7 +956,7 @@ def test_hybrid_block_table_initialization(): max_num_reqs = 10 max_num_blocks_per_req = 20 max_num_batched_tokens = 512 - dcp_kv_cache_interleave_size = 8 + cp_kv_cache_interleave_size = 8 block_table = BlockTable( block_size=block_size, @@ -966,7 +966,7 @@ def test_hybrid_block_table_initialization(): pin_memory=False, device=torch.device(DEVICE), kernel_block_size=kernel_block_sizes[0], - dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size=cp_kv_cache_interleave_size, ) # Verify hybrid block configuration diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 9275d70fd86a4..d28bc065852db 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -266,6 +266,12 @@ class 
AttentionImpl(ABC, Generic[T]): dcp_world_size: int dcp_rank: int + pcp_world_size: int + pcp_rank: int + + total_cp_world_size: int + total_cp_rank: int + def __new__(cls, *args, **kwargs): # use __new__ so that all subclasses will call this self = super().__new__(cls) @@ -278,6 +284,17 @@ class AttentionImpl(ABC, Generic[T]): # DCP might not be initialized in testing self.dcp_world_size = 1 self.dcp_rank = 0 + try: + from vllm.distributed.parallel_state import get_pcp_group + + self.pcp_world_size = get_pcp_group().world_size + self.pcp_rank = get_pcp_group().rank_in_group + except AssertionError: + self.pcp_world_size = 1 + self.pcp_rank = 0 + self.total_cp_world_size = self.pcp_world_size * self.dcp_world_size + self.total_cp_rank = self.pcp_rank * self.dcp_world_size + self.dcp_rank + self.need_to_return_lse_for_decode = ( self.dcp_world_size > 1 and self.can_return_lse_for_decode ) diff --git a/vllm/attention/ops/common.py b/vllm/attention/ops/common.py index 2cbb5c91cc3b3..67c5f7dbba9c0 100644 --- a/vllm/attention/ops/common.py +++ b/vllm/attention/ops/common.py @@ -169,12 +169,11 @@ def correct_attn_out( return out, lse -def cp_lse_ag_out_rs( +def _cp_lse_common( cp_attn_out: torch.Tensor, cp_attn_lse: torch.Tensor, cp_group: GroupCoordinator, - ctx: CPTritonContext = None, - return_lse=False, + ctx: CPTritonContext | None = None, ): """ cp_attn_out: [ B, H, D ] @@ -195,6 +194,22 @@ def cp_lse_ag_out_rs( cp_attn_lse = cp_attn_lse.contiguous() lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses) out, lse = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx) + assert out.is_contiguous() + return out, lse + + +def cp_lse_ag_out_rs( + cp_attn_out: torch.Tensor, + cp_attn_lse: torch.Tensor, + cp_group: GroupCoordinator, + ctx: CPTritonContext | None = None, + return_lse: bool = False, +): + """ + cp_attn_out: [ B, H, D ] + cp_attn_lse: [ B, H ] + """ + out, lse = _cp_lse_common(cp_attn_out, cp_attn_lse, cp_group, ctx=ctx) out = 
cp_group.reduce_scatter(out, dim=1) if return_lse: @@ -205,6 +220,25 @@ def cp_lse_ag_out_rs( return out +def cp_lse_ag_out_ar( + cp_attn_out: torch.Tensor, + cp_attn_lse: torch.Tensor, + cp_group: GroupCoordinator, + ctx: CPTritonContext | None = None, + return_lse: bool = False, +): + """ + cp_attn_out: [ B, H, D ] + cp_attn_lse: [ B, H ] + """ + out, lse = _cp_lse_common(cp_attn_out, cp_attn_lse, cp_group, ctx=ctx) + out = cp_group.all_reduce(out) + + if return_lse: + return out, lse + return out + + @triton.jit def _pack_seq_kernel( x_ptr, # [N, D] diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 0f107a7a3ef83..4b0236d8de3f5 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -71,6 +71,8 @@ class ParallelConfig: """Number of pipeline parallel groups.""" tensor_parallel_size: int = 1 """Number of tensor parallel groups.""" + prefill_context_parallel_size: int = 1 + """Number of prefill context parallel groups.""" data_parallel_size: int = 1 """Number of data parallel groups. MoE layers will be sharded according to the product of the tensor parallel size and data parallel size.""" @@ -239,14 +241,25 @@ class ParallelConfig: needs to be divisible by dcp_size.""" dcp_kv_cache_interleave_size: int = 1 - """Interleave size of kv_cache storage while using dcp or cp > 1, - store interleave_size tokens on (d)cp i, - then store next interleave_size tokens on (d)cp i+1. - Interleave_size=1: token-level align, token i is stored on rank i % (d)cp_size. - Interleave_size=block_size: block-level align, first fill the block on first rank, - token is stored on rank i+1 block j after rank i block j is full. - Block_size should be greater than or equal to dcp_kv_cache_interleave_size. - Block_size should be divisible by dcp_kv_cache_interleave_size. + """ + Interleave size of kv_cache storage while using DCP. + dcp_kv_cache_interleave_size has been replaced by cp_kv_cache_interleave_size, + and will be deprecated when PCP is fully supported. 
+ + """ + cp_kv_cache_interleave_size: int = 1 + """Interleave size of kv_cache storage while using DCP or PCP. + For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`, + and `total_cp_world_size = pcp_world_size * dcp_world_szie`. + store interleave_size tokens on total_cp_rank i, + then store next interleave_size tokens on taotal_cp_rank i+1. + Interleave_size=1: token-level alignment, where token `i` is stored on + total_cp_rank `i % total_cp_world_size`. + Interleave_size=block_size: block-level alignment, where tokens are + first populated to the preceding ranks. Tokens are then stored + in (rank i+1, block j) only after (rank i, block j) is fully occupied. + Block_size should be greater than or equal to cp_kv_cache_interleave_size. + Block_size should be divisible by cp_kv_cache_interleave_size. """ _api_process_count: int = Field(default=1, gt=0) @@ -311,6 +324,11 @@ class ParallelConfig: "num_redundant_experts." ) + if self.prefill_context_parallel_size > 1: + raise ValueError( + "Prefill context parallelism is not fully supported. " + "Please set prefill_context_parallel_size to 1." + ) return self @property @@ -529,7 +547,11 @@ class ParallelConfig: ) # Continue with the rest of the initialization - self.world_size = self.pipeline_parallel_size * self.tensor_parallel_size + self.world_size = ( + self.pipeline_parallel_size + * self.tensor_parallel_size + * self.prefill_context_parallel_size + ) if self.distributed_executor_backend == "external_launcher": logger.info("Using external launcher for distributed inference.") diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 672b004c4aa56..d64e315b4fe39 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -481,6 +481,14 @@ class VllmConfig: "Overriding cudagraph_mode to PIECEWISE." 
) self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE + # prefill context parallel do not support full cudagraphs + elif self.parallel_config.prefill_context_parallel_size > 1: + logger.warning_once( + "Prefill context parallel (PCP) is enabled, which is " + "incompatible with full CUDA graphs. " + "Overriding cudagraph_mode to PIECEWISE." + ) + self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE elif self.model_config is not None: if self.model_config.pooler_config is not None: logger.warning_once( @@ -610,22 +618,34 @@ class VllmConfig: # If DCP, ensure the block size is right. if self.parallel_config.decode_context_parallel_size > 1: + if self.parallel_config.dcp_kv_cache_interleave_size > 1 and ( + self.parallel_config.cp_kv_cache_interleave_size + != self.parallel_config.dcp_kv_cache_interleave_size + ): + self.parallel_config.cp_kv_cache_interleave_size = ( + self.parallel_config.dcp_kv_cache_interleave_size + ) + logger.warning_once( + "cp_kv_cache_interleave_size is overridden by dcp_kv_cache" + "_interleave_size. And dcp-kv-cache-interleave-size will be " + "deprecated when PCP is fully supported." + ) assert ( - self.parallel_config.dcp_kv_cache_interleave_size + self.parallel_config.cp_kv_cache_interleave_size <= self.cache_config.block_size and self.cache_config.block_size - % self.parallel_config.dcp_kv_cache_interleave_size + % self.parallel_config.cp_kv_cache_interleave_size == 0 ), ( f"Block_size({self.cache_config.block_size}) should be greater " - "than or equal to and divisible by dcp_kv_cache_interleave_size " - f"({self.parallel_config.dcp_kv_cache_interleave_size})." + "than or equal to and divisible by cp_kv_cache_interleave_size " + f"({self.parallel_config.cp_kv_cache_interleave_size})." ) assert ( - self.parallel_config.dcp_kv_cache_interleave_size == 1 + self.parallel_config.cp_kv_cache_interleave_size == 1 or self.speculative_config is None - ), "MTP with dcp_kv_cache_interleave_size > 1 is not supported now." 
+ ), "MTP with cp_kv_cache_interleave_size > 1 is not supported now." # Do this after all the updates to compilation_config.mode if self.compilation_config.mode == CompilationMode.VLLM_COMPILE: diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 852c4c644433f..f81612fd1f4a3 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1098,6 +1098,12 @@ get_context_model_parallel_group = get_dcp_group _PP: GroupCoordinator | None = None + +def get_pp_group() -> GroupCoordinator: + assert _PP is not None, "pipeline model parallel group is not initialized" + return _PP + + _DP: GroupCoordinator | None = None @@ -1114,9 +1120,12 @@ def get_ep_group() -> GroupCoordinator: return _EP -def get_pp_group() -> GroupCoordinator: - assert _PP is not None, "pipeline model parallel group is not initialized" - return _PP +_PCP: GroupCoordinator | None = None + + +def get_pcp_group() -> GroupCoordinator: + assert _PCP is not None, "prefill context parallel group is not initialized" + return _PCP @deprecated( @@ -1276,6 +1285,7 @@ def init_distributed_environment( def initialize_model_parallel( tensor_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1, + prefill_context_model_parallel_size: int = 1, decode_context_model_parallel_size: int | None = 1, backend: str | None = None, ) -> None: @@ -1325,7 +1335,11 @@ def initialize_model_parallel( # to get group_ranks for each dimension, transpose that dimension to the # last dimension, then reshape to 2D, then unbind the last dimension all_ranks = torch.arange(world_size).reshape( - -1, data_parallel_size, pipeline_model_parallel_size, tensor_model_parallel_size + -1, + data_parallel_size, + pipeline_model_parallel_size, + prefill_context_model_parallel_size, + tensor_model_parallel_size, ) # noqa # Build the tensor model-parallel groups. 
@@ -1360,11 +1374,23 @@ def initialize_model_parallel( group_name="dcp", ) + global _PCP + assert _PCP is None, "prefill context parallel group is already initialized" + group_ranks = ( + all_ranks.transpose(3, 4) + .reshape(-1, prefill_context_model_parallel_size) + .unbind(0) + ) + group_ranks = [x.tolist() for x in group_ranks] + _PCP = init_model_parallel_group( + group_ranks, get_world_group().local_rank, backend, group_name="pcp" + ) + # Build the pipeline model-parallel groups. global _PP assert _PP is None, "pipeline model parallel group is already initialized" group_ranks = ( - all_ranks.transpose(2, 3).reshape(-1, pipeline_model_parallel_size).unbind(0) + all_ranks.transpose(2, 4).reshape(-1, pipeline_model_parallel_size).unbind(0) ) group_ranks = [x.tolist() for x in group_ranks] _PP = init_model_parallel_group( @@ -1373,7 +1399,7 @@ def initialize_model_parallel( global _DP assert _DP is None, "data parallel group is already initialized" - group_ranks = all_ranks.transpose(1, 3).reshape(-1, data_parallel_size).unbind(0) + group_ranks = all_ranks.transpose(1, 4).reshape(-1, data_parallel_size).unbind(0) group_ranks = [x.tolist() for x in group_ranks] _DP = init_model_parallel_group( group_ranks, get_world_group().local_rank, backend, group_name="dp" @@ -1383,7 +1409,12 @@ def initialize_model_parallel( assert _EP is None, "expert parallel group is already initialized" group_ranks = ( all_ranks.transpose(1, 2) - .reshape(-1, data_parallel_size * tensor_model_parallel_size) + .reshape( + -1, + data_parallel_size + * prefill_context_model_parallel_size + * tensor_model_parallel_size, + ) .unbind(0) ) group_ranks = [x.tolist() for x in group_ranks] @@ -1393,11 +1424,13 @@ def initialize_model_parallel( logger.info_once( "rank %s in world size %s is assigned as " - "DP rank %s, PP rank %s, TP rank %s, EP rank %s", + "DP rank %s, PP rank %s, PCP rank %s, " + "TP rank %s, EP rank %s", rank, world_size, _DP.rank_in_group, _PP.rank_in_group, + _PCP.rank_in_group, 
_TP.rank_in_group, _EP.rank_in_group, ) @@ -1406,6 +1439,7 @@ def initialize_model_parallel( def ensure_model_parallel_initialized( tensor_model_parallel_size: int, pipeline_model_parallel_size: int, + prefill_context_model_parallel_size: int = 1, decode_context_model_parallel_size: int | None = 1, backend: str | None = None, ) -> None: @@ -1418,6 +1452,7 @@ def ensure_model_parallel_initialized( initialize_model_parallel( tensor_model_parallel_size, pipeline_model_parallel_size, + prefill_context_model_parallel_size, decode_context_model_parallel_size, backend, ) @@ -1434,6 +1469,12 @@ def ensure_model_parallel_initialized( f"got: {pp_world_size=} vs. " f"wanted: {pipeline_model_parallel_size=}" ) + pcp_world_size = get_pcp_group().world_size + assert pcp_world_size == prefill_context_model_parallel_size, ( + "prefill context parallel group already initialized, but of unexpected size: " + f"{pcp_world_size=} vs. " + f"{prefill_context_model_parallel_size=}" + ) def prepare_communication_buffer_for_model(model: torch.nn.Module): @@ -1445,6 +1486,8 @@ def prepare_communication_buffer_for_model(model: torch.nn.Module): """ if _TP is not None: _TP.prepare_communication_buffer_for_model(model) + if _PCP is not None: + _PCP.prepare_communication_buffer_for_model(model) if _PP is not None: _PP.prepare_communication_buffer_for_model(model) if _DP is not None: @@ -1520,16 +1563,21 @@ def destroy_model_parallel(): _TP.destroy() _TP = None - global _PP - if _PP: - _PP.destroy() - _PP = None - global _DCP if _DCP: _DCP.destroy() _DCP = None + global _PCP + if _PCP: + _PCP.destroy() + _PCP = None + + global _PP + if _PP: + _PP.destroy() + _PP = None + global _DP if _DP: _DP.destroy() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e2f7326448b3a..68205b6079d78 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -389,8 +389,10 @@ class EngineArgs: nnodes: int = ParallelConfig.nnodes node_rank: int = ParallelConfig.node_rank 
tensor_parallel_size: int = ParallelConfig.tensor_parallel_size + prefill_context_parallel_size: int = ParallelConfig.prefill_context_parallel_size decode_context_parallel_size: int = ParallelConfig.decode_context_parallel_size dcp_kv_cache_interleave_size: int = ParallelConfig.dcp_kv_cache_interleave_size + cp_kv_cache_interleave_size: int = ParallelConfig.cp_kv_cache_interleave_size data_parallel_size: int = ParallelConfig.data_parallel_size data_parallel_rank: int | None = None data_parallel_start_rank: int | None = None @@ -770,6 +772,15 @@ class EngineArgs: "--dcp-kv-cache-interleave-size", **parallel_kwargs["dcp_kv_cache_interleave_size"], ) + parallel_group.add_argument( + "--cp-kv-cache-interleave-size", + **parallel_kwargs["cp_kv_cache_interleave_size"], + ) + parallel_group.add_argument( + "--prefill-context-parallel-size", + "-pcp", + **parallel_kwargs["prefill_context_parallel_size"], + ) parallel_group.add_argument( "--data-parallel-size", "-dp", **parallel_kwargs["data_parallel_size"] ) @@ -1600,6 +1611,7 @@ class EngineArgs: parallel_config = ParallelConfig( pipeline_parallel_size=self.pipeline_parallel_size, tensor_parallel_size=self.tensor_parallel_size, + prefill_context_parallel_size=self.prefill_context_parallel_size, data_parallel_size=self.data_parallel_size, data_parallel_rank=self.data_parallel_rank or 0, data_parallel_external_lb=data_parallel_external_lb, @@ -1631,6 +1643,7 @@ class EngineArgs: worker_extension_cls=self.worker_extension_cls, decode_context_parallel_size=self.decode_context_parallel_size, dcp_kv_cache_interleave_size=self.dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size=self.cp_kv_cache_interleave_size, _api_process_count=self._api_process_count, _api_process_rank=self._api_process_rank, ) @@ -1952,6 +1965,15 @@ class EngineArgs: default_prefix_caching, ) = self.get_chunked_prefill_prefix_caching_defaults(model_config) + if self.prefill_context_parallel_size > 1: + default_chunked_prefill = False + 
default_prefix_caching = False + logger.warning( + "--prefill-context-parallel-size > 1 is not compatible with " + "chunked prefill and prefix caching now. Chunked prefill " + "and prefix caching have been disabled by default." + ) + if self.enable_chunked_prefill is None: self.enable_chunked_prefill = default_chunked_prefill diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index a7bd64b1c65e9..21eb4d590a7d1 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -8,7 +8,11 @@ import torch import vllm.envs as envs from vllm.config import ParallelConfig -from vllm.distributed import get_dp_group, get_tensor_model_parallel_rank +from vllm.distributed import ( + get_dp_group, + get_pcp_group, + get_tensor_model_parallel_rank, +) from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import ( OCP_MX_DTYPES, @@ -684,9 +688,11 @@ FUSED_MOE_UNQUANTIZED_CONFIG: FusedMoEQuantConfig = FusedMoEQuantConfig.make() @dataclass class FusedMoEParallelConfig: tp_size: int + pcp_size: int dp_size: int ep_size: int tp_rank: int + pcp_rank: int dp_rank: int ep_rank: int @@ -713,19 +719,22 @@ class FusedMoEParallelConfig: return self.use_all2all_kernels and self.all2all_backend == "deepep_low_latency" @staticmethod - def flatten_tp_across_dp( - tp_size: int, dp_size: int, dp_rank: int + def flatten_tp_across_dp_and_pcp( + tp_size: int, dp_size: int, dp_rank: int, pcp_size: int, pcp_rank: int ) -> tuple[int, int]: tp_rank = 0 if tp_size == 1 else get_tensor_model_parallel_rank() - # There are actually dp_size * tp_size devices. Update tp_size - # and tp_rank so we shard across all devices. - flatten_tp_size = dp_size * tp_size - flatten_tp_rank = dp_rank * tp_size + tp_rank + # There are actually dp_size * pcp_size * tp_size devices. + # Update tp_size and tp_rank so we shard across all devices. 
+ flatten_tp_size = dp_size * pcp_size * tp_size + flatten_tp_rank = dp_rank * pcp_size * tp_size + pcp_rank * tp_size + tp_rank return flatten_tp_size, flatten_tp_rank @staticmethod def make( - tp_size_: int, dp_size_: int, vllm_parallel_config: ParallelConfig + tp_size_: int, + pcp_size_: int, + dp_size_: int, + vllm_parallel_config: ParallelConfig, ) -> "FusedMoEParallelConfig": """ Determine MoE parallel configuration. Based on the input `tp_size_`, @@ -734,19 +743,22 @@ class FusedMoEParallelConfig: Args: tp_size_ (int): `tp_size` passed into the FusedMoE constructor. + pcp_size_ (int): `pcp_size` passed into the FusedMoE constructor. dp_size_ (int): `dp_size` passed into the FusedMoE constructor. vllm_parallel_config (ParallelConfig): vLLM's parallel config object which contains the `enable_expert_parallel` flag. Examples: When there is no parallelism requested, - i.e. `tp_size_` = `dp_size_` = 1, we simply return the sizes + i.e. `tp_size_` = `pcp_size_` = `dp_size_` = 1, we simply return the sizes unaltered and the ranks set to 0. - Expert Parallelism is considered only when either `dp_size_` or + Expert Parallelism is considered only when either `dp_size_`, `pcp_size_` or `tp_size_` is non trivial. - When TP = 2, DP = 1 and EP = False, the configuration on different + Note that PCP serves the same function as DP here. + + When TP = 2, DP(PCP) = 1 and EP = False, the configuration on different devices: - device 0 : TP = {2, 0} DP = {1, 0} EP = {1, 0} // @@ -754,7 +766,7 @@ class FusedMoEParallelConfig: - device 1 : TP = {2, 1} DP = {1, 0} EP = {1, 0} - Comment : Tensors are sharded across 2 devices. - When TP = 1, DP = 2 and EP = False, the configuration on different + When TP = 1, DP(PCP) = 2 and EP = False, the configuration on different devices: - device 0 : TP = {2, 0} DP = {2, 0} EP = {1, 0} @@ -762,7 +774,7 @@ class FusedMoEParallelConfig: - Comment: There are 2 engine instances and the tensors are sharded across 2 decvices. 
- When TP = 2, DP = 2 and EP = False, the configuration on different + When TP = 2, DP(PCP) = 2 and EP = False, the configuration on different devices: - device 0: TP = {4, 0} DP = {2, 0} EP = {1, 0} @@ -772,14 +784,14 @@ class FusedMoEParallelConfig: - Comment: There are 2 engine instances and the tensors are sharded across 4 devices. - When, TP = 2, DP = 1 and EP = True, the configuration on different + When, TP = 2, DP(PCP) = 1 and EP = True, the configuration on different devices: - device 0: TP = {1, 0} DP = {1, 0} EP = {2, 0} - device 1: TP = {1, 0} DP = {1, 0} EP = {2, 1} - Comment: The experts are split between the 2 devices. - When, TP = 1, DP = 2 and EP = True, the configuration on different + When, TP = 1, DP(PCP) = 2 and EP = True, the configuration on different devices: - device 0: TP = {1, 0} DP = {2, 0} EP = {2, 0} @@ -787,7 +799,7 @@ class FusedMoEParallelConfig: - Comment: There are 2 engine instances and the experts are split between the 2 devices. - When TP = 2, DP = 2 and EP = True, the configuration on different + When TP = 2, DP(PCP) = 2 and EP = True, the configuration on different devices: - device 0: TP = {1, 0} DP = {2, 0} EP = {4, 0} @@ -798,18 +810,25 @@ class FusedMoEParallelConfig: between the 4 devices. 
""" - use_ep = dp_size_ * tp_size_ > 1 and vllm_parallel_config.enable_expert_parallel + use_ep = ( + dp_size_ * pcp_size_ * tp_size_ > 1 + and vllm_parallel_config.enable_expert_parallel + ) dp_size = dp_size_ dp_rank = get_dp_group().rank_in_group if dp_size > 1 else 0 - tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp( - tp_size_, dp_size_, dp_rank + pcp_size = pcp_size_ + pcp_rank = get_pcp_group().rank_in_group if pcp_size > 1 else 0 + tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp_and_pcp( + tp_size_, dp_size_, dp_rank, pcp_size_, pcp_rank ) if not use_ep: return FusedMoEParallelConfig( tp_size=tp_size, tp_rank=tp_rank, + pcp_size=pcp_size, + pcp_rank=pcp_rank, dp_size=dp_size, dp_rank=dp_rank, ep_size=1, @@ -826,6 +845,8 @@ class FusedMoEParallelConfig: return FusedMoEParallelConfig( tp_size=1, tp_rank=0, + pcp_size=pcp_size, + pcp_rank=pcp_rank, dp_size=dp_size, dp_rank=dp_rank, ep_size=ep_size, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 8e9bba3442873..7b15e63e9e350 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -18,6 +18,7 @@ from vllm.config.parallel import ExpertPlacementStrategy from vllm.distributed import ( get_dp_group, get_ep_group, + get_pcp_group, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce, ) @@ -343,6 +344,7 @@ class FusedMoE(CustomOp): tp_size: int | None = None, ep_size: int | None = None, dp_size: int | None = None, + pcp_size: int | None = None, prefix: str = "", custom_routing_function: Callable | None = None, scoring_func: str = "softmax", @@ -398,12 +400,14 @@ class FusedMoE(CustomOp): tp_size if tp_size is not None else get_tensor_model_parallel_world_size() ) dp_size_ = dp_size if dp_size is not None else get_dp_group().world_size + pcp_size_ = pcp_size if pcp_size is not None else get_pcp_group().world_size self.is_sequence_parallel = 
is_sequence_parallel self.sp_size = tp_size_ if is_sequence_parallel else 1 self.moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make( tp_size_=tp_size_, + pcp_size_=pcp_size_, dp_size_=dp_size_, vllm_parallel_config=vllm_config.parallel_config, ) @@ -679,6 +683,10 @@ class FusedMoE(CustomOp): def dp_size(self): return self.moe_parallel_config.dp_size + @property + def pcp_size(self): + return self.moe_parallel_config.pcp_size + @property def ep_size(self): return self.moe_parallel_config.ep_size @@ -691,6 +699,10 @@ class FusedMoE(CustomOp): def dp_rank(self): return self.moe_parallel_config.dp_rank + @property + def pcp_rank(self): + return self.moe_parallel_config.pcp_rank + @property def ep_rank(self): return self.moe_parallel_config.ep_rank @@ -1871,6 +1883,19 @@ class FusedMoE(CustomOp): assert self.shared_experts is not None shared_output = self.shared_experts(hidden_states) + # NOTE: Similar with DP, PCP also needs dispatch and combine. For + # simplicity, AgRsAll2All was added separately for PCP here. Maybe + # we should modify All2AllManager abstract to better support PCP. + if self.pcp_size > 1: + hidden_states = get_pcp_group().all_gather( + hidden_states, + dim=0, + ) + router_logits = get_pcp_group().all_gather( + router_logits, + dim=0, + ) + # Matrix multiply. 
final_hidden_states = self.quant_method.apply( layer=self, @@ -1925,6 +1950,13 @@ class FusedMoE(CustomOp): def combine_output(states: torch.Tensor) -> torch.Tensor: if do_naive_dispatch_combine: states = get_ep_group().combine(states, self.is_sequence_parallel) + + if self.pcp_size > 1: + states = get_pcp_group().reduce_scatter( + states, + dim=0, + ) + return states if self.shared_experts is not None: diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index f310f71af92d9..25048330f7974 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -13,6 +13,7 @@ from vllm.config import CacheConfig, VllmConfig from vllm.distributed import ( get_dp_group, get_ep_group, + get_pcp_group, get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -322,10 +323,12 @@ class GptOssModel(nn.Module): # In MoE, we need to flatten the tensor parallel size across the data # parallel size when EP is disabled. - tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp( + tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp_and_pcp( tp_size=get_tensor_model_parallel_world_size(), dp_size=get_dp_group().world_size, dp_rank=get_dp_group().rank_in_group, + pcp_size=get_pcp_group().world_size, + pcp_rank=get_pcp_group().rank_in_group, ) intermediate_size = self.config.intermediate_size @@ -507,10 +510,12 @@ class GptOssModel(nn.Module): # In MoE, we need to flatten the tensor parallel size across the data # parallel size when EP is disabled. 
- tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp( + tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp_and_pcp( tp_size=get_tensor_model_parallel_world_size(), dp_size=get_dp_group().world_size, dp_rank=get_dp_group().rank_in_group, + pcp_size=get_pcp_group().world_size, + pcp_rank=get_pcp_group().rank_in_group, ) intermediate_size = self.config.intermediate_size diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index fdc99a0df1c8a..cf3c1d05f5b3f 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -265,8 +265,8 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad self.dcp_world_size = 1 self.dcp_rank = 0 - self.dcp_kv_cache_interleave_size = ( - self.parallel_config.dcp_kv_cache_interleave_size + self.cp_kv_cache_interleave_size = ( + self.parallel_config.cp_kv_cache_interleave_size ) self.use_full_cuda_graph = ( @@ -388,7 +388,7 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad dcp_context_kv_lens_cpu, self.dcp_world_size, self.dcp_rank, - self.dcp_kv_cache_interleave_size, + self.cp_kv_cache_interleave_size, ) dcp_context_kv_lens = dcp_context_kv_lens_cpu.to(self.device) max_dcp_context_kv_len = dcp_context_kv_lens.max().item() diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index e328049b53c7e..32f406980f2ed 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -536,7 +536,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): # DCP might not be initialized in testing self.dcp_world_size = 1 self.dcp_rank = 0 - self.dcp_local_block_size = parallel_config.dcp_kv_cache_interleave_size + self.dcp_local_block_size = parallel_config.cp_kv_cache_interleave_size self.dcp_virtual_block_size = self.dcp_local_block_size * self.dcp_world_size # Don't try to access the runner 
on AMD @@ -1289,8 +1289,8 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): get_current_vllm_config() ) ) - self.dcp_kv_cache_interleave_size: int = ( - get_current_vllm_config().parallel_config.dcp_kv_cache_interleave_size + self.cp_kv_cache_interleave_size: int = ( + get_current_vllm_config().parallel_config.cp_kv_cache_interleave_size ) def _flash_attn_varlen_diff_headdims( diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 0dd1896331291..540a8e2b1d016 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -1080,9 +1080,9 @@ def compute_causal_conv1d_metadata(query_start_loc_p: torch.Tensor): def get_dcp_local_seq_lens( seq_lens: torch.Tensor, - dcp_world_size: int = 1, + dcp_size: int = 1, dcp_rank: int | None = None, - dcp_kv_cache_interleave_size: int = 1, + cp_kv_cache_interleave_size: int = 1, ) -> torch.Tensor: """While using dcp, kv_cache size stored on each rank may be different, use this function to calculate split decode seq_lens of each dcp rank. 
@@ -1091,7 +1091,7 @@ def get_dcp_local_seq_lens( num_requests = seq_lens.size(0) if dcp_rank is None: rank_offsets = ( - torch.arange(dcp_world_size, dtype=torch.int32) + torch.arange(dcp_size, dtype=torch.int32) .unsqueeze(0) .repeat(num_requests, 1) ) @@ -1102,15 +1102,15 @@ def get_dcp_local_seq_lens( ) base = ( seq_lens_tiled - // dcp_kv_cache_interleave_size - // dcp_world_size - * dcp_kv_cache_interleave_size + // cp_kv_cache_interleave_size + // dcp_size + * cp_kv_cache_interleave_size ) - remainder = seq_lens_tiled - base * dcp_world_size + remainder = seq_lens_tiled - base * dcp_size remainder = torch.clip( - remainder - rank_offsets * dcp_kv_cache_interleave_size, + remainder - rank_offsets * cp_kv_cache_interleave_size, 0, - dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size, ) dcp_local_seq_lens = base + remainder return dcp_local_seq_lens.squeeze(1) diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py index 137e5e0cdb6d2..1531b61f88fe2 100644 --- a/vllm/v1/core/kv_cache_coordinator.py +++ b/vllm/v1/core/kv_cache_coordinator.py @@ -27,6 +27,7 @@ class KVCacheCoordinator(ABC): enable_caching: bool, enable_kv_cache_events: bool, dcp_world_size: int, + pcp_world_size: int, ): self.kv_cache_config = kv_cache_config self.max_model_len = max_model_len @@ -44,6 +45,7 @@ class KVCacheCoordinator(ABC): block_pool=self.block_pool, kv_cache_group_id=i, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) for i, kv_cache_group in enumerate(self.kv_cache_config.kv_cache_groups) ) @@ -210,6 +212,7 @@ class KVCacheCoordinatorNoPrefixCache(KVCacheCoordinator): use_eagle: bool, enable_kv_cache_events: bool, dcp_world_size: int, + pcp_world_size: int, ): super().__init__( kv_cache_config, @@ -218,6 +221,7 @@ class KVCacheCoordinatorNoPrefixCache(KVCacheCoordinator): False, enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) self.num_single_type_manager = 
len(self.single_type_managers) @@ -250,6 +254,7 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator): enable_caching: bool, enable_kv_cache_events: bool, dcp_world_size: int, + pcp_world_size: int, ): super().__init__( kv_cache_config, @@ -258,12 +263,16 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator): enable_caching, enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) self.kv_cache_spec = self.kv_cache_config.kv_cache_groups[0].kv_cache_spec self.block_size = self.kv_cache_spec.block_size self.dcp_world_size = dcp_world_size + self.pcp_world_size = pcp_world_size if dcp_world_size > 1: self.block_size *= dcp_world_size + if pcp_world_size > 1: + self.block_size *= pcp_world_size assert len(self.kv_cache_config.kv_cache_groups) == 1, ( "UnitaryKVCacheCoordinator assumes only one kv cache group" ) @@ -281,6 +290,7 @@ class UnitaryKVCacheCoordinator(KVCacheCoordinator): kv_cache_spec=self.kv_cache_spec, use_eagle=self.use_eagle, dcp_world_size=self.dcp_world_size, + pcp_world_size=self.pcp_world_size, ) return hit_blocks, len(hit_blocks[0]) * self.block_size @@ -302,6 +312,7 @@ class HybridKVCacheCoordinator(KVCacheCoordinator): enable_caching: bool, enable_kv_cache_events: bool, dcp_world_size: int, + pcp_world_size: int, ): super().__init__( kv_cache_config, @@ -310,8 +321,10 @@ class HybridKVCacheCoordinator(KVCacheCoordinator): enable_caching, enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) assert dcp_world_size == 1, "DCP not support hybrid attn now." + assert pcp_world_size == 1, "PCP not support hybrid attn now." 
self.verify_and_split_kv_cache_groups() def verify_and_split_kv_cache_groups(self) -> None: @@ -452,6 +465,7 @@ def get_kv_cache_coordinator( enable_caching: bool, enable_kv_cache_events: bool, dcp_world_size: int, + pcp_world_size: int, ) -> KVCacheCoordinator: if not enable_caching: return KVCacheCoordinatorNoPrefixCache( @@ -460,6 +474,7 @@ def get_kv_cache_coordinator( use_eagle, enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) if len(kv_cache_config.kv_cache_groups) == 1: return UnitaryKVCacheCoordinator( @@ -469,6 +484,7 @@ def get_kv_cache_coordinator( enable_caching, enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) return HybridKVCacheCoordinator( kv_cache_config, @@ -477,4 +493,5 @@ def get_kv_cache_coordinator( enable_caching, enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 7f405fc248ac2..2012c3fef88bc 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -100,6 +100,7 @@ class KVCacheManager: log_stats: bool = False, enable_kv_cache_events: bool = False, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> None: self.max_model_len = max_model_len @@ -124,12 +125,9 @@ class KVCacheManager: 0 ].kv_cache_spec.block_size - if dcp_world_size > 1: + if dcp_world_size * pcp_world_size > 1: assert len(kv_cache_config.kv_cache_groups) == 1 - # Note(hc): need revisit. When both DCP and any future - # PCP are enabled, the block_size may need to be scaled - # by a factor of dcp_size × pcp_size? 
- self.block_size *= dcp_world_size + self.block_size *= dcp_world_size * pcp_world_size self.coordinator = get_kv_cache_coordinator( kv_cache_config=kv_cache_config, @@ -138,6 +136,7 @@ class KVCacheManager: enable_caching=self.enable_caching, enable_kv_cache_events=enable_kv_cache_events, dcp_world_size=dcp_world_size, + pcp_world_size=pcp_world_size, ) self.num_kv_cache_groups = len(kv_cache_config.kv_cache_groups) self.block_pool = self.coordinator.block_pool diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 6e026215d4022..01ecd881115df 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -1219,11 +1219,16 @@ def _report_kv_cache_config( // len(kv_cache_config.kv_cache_groups) * min_block_size ) - if vllm_config.parallel_config.decode_context_parallel_size > 1: - num_tokens *= vllm_config.parallel_config.decode_context_parallel_size + dcp_size = vllm_config.parallel_config.decode_context_parallel_size + pcp_size = vllm_config.parallel_config.prefill_context_parallel_size + if pcp_size * dcp_size > 1: + num_tokens *= pcp_size * dcp_size logger.info( - "Multiplying the GPU KV cache size by the dcp_world_size %d.", - vllm_config.parallel_config.decode_context_parallel_size, + "Multiplying the GPU KV cache size by the cp_world_size %d " + "(pcp_world_size %d * dcp_world_size %d).", + pcp_size * dcp_size, + pcp_size, + dcp_size, ) num_tokens_str = f"{num_tokens:,}" logger.info_once("GPU KV cache size: %s tokens", num_tokens_str, scope="local") diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 4323141c435b7..4cc4c29591cc0 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -121,6 +121,7 @@ class Scheduler(SchedulerInterface): self.block_size = block_size self.dcp_world_size = vllm_config.parallel_config.decode_context_parallel_size + self.pcp_world_size = vllm_config.parallel_config.prefill_context_parallel_size # req_id -> Request 
self.requests: dict[str, Request] = {} @@ -183,6 +184,7 @@ class Scheduler(SchedulerInterface): log_stats=self.log_stats, enable_kv_cache_events=self.enable_kv_cache_events, dcp_world_size=self.dcp_world_size, + pcp_world_size=self.pcp_world_size, ) self.use_pp = self.parallel_config.pipeline_parallel_size > 1 diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 14ac83028ee44..d90ec550f7666 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -32,6 +32,7 @@ class SingleTypeKVCacheManager(ABC): block_pool: BlockPool, kv_cache_group_id: int, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> None: """ Initializes the SingleTypeKVCacheManager. @@ -42,8 +43,9 @@ class SingleTypeKVCacheManager(ABC): """ self.block_size = kv_cache_spec.block_size self.dcp_world_size = dcp_world_size - if self.dcp_world_size > 1: - self.block_size *= dcp_world_size + self.pcp_world_size = pcp_world_size + if dcp_world_size * pcp_world_size > 1: + self.block_size *= dcp_world_size * pcp_world_size self.kv_cache_spec = kv_cache_spec self.block_pool = block_pool @@ -212,6 +214,7 @@ class SingleTypeKVCacheManager(ABC): kv_cache_spec: KVCacheSpec, use_eagle: bool, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: """ Get the longest cache hit prefix of the blocks that is not longer than @@ -303,6 +306,7 @@ class FullAttentionManager(SingleTypeKVCacheManager): kv_cache_spec: KVCacheSpec, use_eagle: bool, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: assert isinstance( kv_cache_spec, (FullAttentionSpec, ChunkedLocalAttentionSpec) @@ -314,8 +318,8 @@ class FullAttentionManager(SingleTypeKVCacheManager): [] for _ in range(len(kv_cache_group_ids)) ) block_size = kv_cache_spec.block_size - if dcp_world_size > 1: - block_size *= dcp_world_size + if dcp_world_size * pcp_world_size > 1: + block_size 
*= dcp_world_size * pcp_world_size max_num_blocks = max_length // block_size for block_hash in itertools.islice(block_hashes, max_num_blocks): # block_hashes is a chain of block hashes. If a block hash is not @@ -362,11 +366,13 @@ class SlidingWindowManager(SingleTypeKVCacheManager): kv_cache_spec: KVCacheSpec, use_eagle: bool, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: assert isinstance(kv_cache_spec, SlidingWindowSpec), ( "SlidingWindowManager can only be used for sliding window groups" ) assert dcp_world_size == 1, "DCP not support sliding window attn now." + assert pcp_world_size == 1, "PCP not support sliding window attn now." # The number of contiguous blocks needed for prefix cache hit. # -1 since the input token itself is also included in the window @@ -476,6 +482,7 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager): kv_cache_spec: KVCacheSpec, use_eagle: bool, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: """ For chunked local attention, we need to find the longest cache hit @@ -516,6 +523,7 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager): "Hybrid KV cache is not supported for " + "eagle + chunked local attention." ) assert dcp_world_size == 1, "DCP not support chunked local attn now." + assert pcp_world_size == 1, "PCP not support chunked local attn now." max_num_blocks = max_length // kv_cache_spec.block_size if max_length > 0: local_attention_start_idx = ( @@ -611,11 +619,13 @@ class MambaManager(SingleTypeKVCacheManager): kv_cache_spec: KVCacheSpec, use_eagle: bool, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: assert isinstance(kv_cache_spec, MambaSpec), ( "MambaManager can only be used for mamba groups" ) assert dcp_world_size == 1, "DCP not support mamba now." + assert pcp_world_size == 1, "PCP not support mamba now." computed_blocks: tuple[list[KVCacheBlock], ...] 
= tuple( [] for _ in range(len(kv_cache_group_ids)) ) @@ -705,6 +715,7 @@ class CrossAttentionManager(SingleTypeKVCacheManager): kv_cache_spec: KVCacheSpec, use_eagle: bool, dcp_world_size: int = 1, + pcp_world_size: int = 1, ) -> tuple[list[KVCacheBlock], ...]: assert isinstance(kv_cache_spec, CrossAttentionSpec), ( "CrossAttentionManager can only be used for cross-attention groups" diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 3a25827cec385..6be19894d332a 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -128,6 +128,7 @@ class EngineCore: scheduler_block_size = ( vllm_config.cache_config.block_size * vllm_config.parallel_config.decode_context_parallel_size + * vllm_config.parallel_config.prefill_context_parallel_size ) self.scheduler: SchedulerInterface = Scheduler( diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index ad2ece50f9815..7e8ebe25c4603 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -35,6 +35,7 @@ from vllm.distributed.parallel_state import ( get_dp_group, get_ep_group, get_inner_dp_world_group, + get_pcp_group, get_pp_group, get_tp_group, ) @@ -110,12 +111,14 @@ class MultiprocExecutor(Executor): f"({self.parallel_config.nnodes_within_dp}). " ) self.local_world_size = self.parallel_config.local_world_size - tensor_parallel_size = self.parallel_config.tensor_parallel_size - pp_parallel_size = self.parallel_config.pipeline_parallel_size - assert self.world_size == tensor_parallel_size * pp_parallel_size, ( + tp_size = self.parallel_config.tensor_parallel_size + pp_size = self.parallel_config.pipeline_parallel_size + pcp_size = self.parallel_config.prefill_context_parallel_size + assert self.world_size == tp_size * pp_size * pcp_size, ( f"world_size ({self.world_size}) must be equal to the " - f"tensor_parallel_size ({tensor_parallel_size}) x pipeline" - f"_parallel_size ({pp_parallel_size}). 
" + f"tensor_parallel_size ({tp_size}) x pipeline" + f"_parallel_size ({pp_size}) x prefill_context" + f"_parallel_size ({pcp_size}). " ) # Set multiprocessing envs @@ -424,7 +427,11 @@ class MultiprocExecutor(Executor): # 16-23, PP rank 2 # 24-31, PP rank 3 # so world_size - tp_size = 32 - 8 = 24 should be PP rank = -1 (i.e. 3) - return self.world_size - self.parallel_config.tensor_parallel_size + return ( + self.world_size + - self.parallel_config.tensor_parallel_size + * self.parallel_config.prefill_context_parallel_size + ) @dataclass @@ -828,6 +835,8 @@ class WorkerProc: dp_rank = get_dp_group().rank_in_group pp_size = get_pp_group().world_size pp_rank = get_pp_group().rank_in_group + pcp_size = get_pcp_group().world_size + pcp_rank = get_pcp_group().rank_in_group tp_size = get_tp_group().world_size tp_rank = get_tp_group().rank_in_group dcp_size = get_dcp_group().world_size @@ -837,6 +846,8 @@ class WorkerProc: process_name += f"_DP{dp_rank}" if pp_size > 1: process_name += f"_PP{pp_rank}" + if pcp_size > 1: + process_name += f"_PCP{pcp_rank}" if tp_size > 1: process_name += f"_TP{tp_rank}" if dcp_size > 1: diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 7f33eb7e699c7..751862aa9c767 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -95,10 +95,11 @@ class FullAttentionSpec(AttentionSpec): def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: max_model_len = vllm_config.model_config.max_model_len dcp_world_size = vllm_config.parallel_config.decode_context_parallel_size + pcp_world_size = vllm_config.parallel_config.prefill_context_parallel_size # Note(hc): each dcp rank only need save # (max_model_len//dcp_world_size) tokens locally. 
- if dcp_world_size > 1: - max_model_len = cdiv(max_model_len, dcp_world_size) + if dcp_world_size * pcp_world_size > 1: + max_model_len = cdiv(max_model_len, dcp_world_size * pcp_world_size) return cdiv(max_model_len, self.block_size) * self.page_size_bytes @classmethod diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 9f6c19e464308..76e17f3797a1a 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -4,7 +4,7 @@ import numpy as np import torch -from vllm.distributed import get_dcp_group +from vllm.distributed import get_dcp_group, get_pcp_group from vllm.logger import init_logger from vllm.utils.math_utils import cdiv from vllm.v1.utils import CpuGpuBuffer @@ -22,7 +22,7 @@ class BlockTable: pin_memory: bool, device: torch.device, kernel_block_size: int, - dcp_kv_cache_interleave_size: int, + cp_kv_cache_interleave_size: int, ): """ Args: @@ -80,6 +80,13 @@ class BlockTable: else: self._kernel_block_arange = None + try: + self.pcp_world_size = get_pcp_group().world_size + self.pcp_rank = get_pcp_group().rank_in_group + except AssertionError: + # DCP might not be initialized in testing + self.pcp_world_size = 1 + self.pcp_rank = 0 try: self.dcp_world_size = get_dcp_group().world_size self.dcp_rank = get_dcp_group().rank_in_group @@ -87,7 +94,7 @@ class BlockTable: # DCP might not be initialized in testing self.dcp_world_size = 1 self.dcp_rank = 0 - self.dcp_kv_cache_interleave_size = dcp_kv_cache_interleave_size + self.cp_kv_cache_interleave_size = cp_kv_cache_interleave_size def append_row( self, @@ -131,14 +138,16 @@ class BlockTable: # NOTE(woosuk): We can't simply use `token_indices // block_size` # here because M (max_model_len) is not necessarily divisible by # block_size. 
- if self.dcp_world_size > 1: + total_cp_world_size = self.pcp_world_size * self.dcp_world_size + total_cp_rank = self.pcp_rank * self.dcp_world_size + self.dcp_rank + if total_cp_world_size > 1: # Note(hc): The DCP implement store kvcache with an interleave # style, the kvcache for the token whose token_idx is i is # always stored on the GPU whose dcp_rank equals i % cp_world_size: # Use a "virtual block" which equals to world_size * block_size # for block_table_indices calculation. - virtual_block_size = self.block_size * self.dcp_world_size + virtual_block_size = self.block_size * total_cp_world_size block_table_indices = ( req_indices * self.max_num_blocks_per_req + positions // virtual_block_size @@ -150,16 +159,16 @@ class BlockTable: virtual_block_offsets = positions % virtual_block_size mask = ( virtual_block_offsets - // self.dcp_kv_cache_interleave_size - % self.dcp_world_size - == self.dcp_rank + // self.cp_kv_cache_interleave_size + % total_cp_world_size + == total_cp_rank ) # Calculate local block_offsets block_offsets = ( virtual_block_offsets - // (self.dcp_world_size * self.dcp_kv_cache_interleave_size) - * self.dcp_kv_cache_interleave_size - + virtual_block_offsets % self.dcp_kv_cache_interleave_size + // (total_cp_world_size * self.cp_kv_cache_interleave_size) + * self.cp_kv_cache_interleave_size + + virtual_block_offsets % self.cp_kv_cache_interleave_size ) # Calculate slot_mapping slot_mapping = block_numbers * self.block_size + block_offsets @@ -253,7 +262,7 @@ class MultiGroupBlockTable: block_sizes: list[int], kernel_block_sizes: list[int], num_speculative_tokens: int = 0, - dcp_kv_cache_interleave_size: int = 1, + cp_kv_cache_interleave_size: int = 1, ) -> None: # Note(hc): each dcp rank only store # (max_model_len//dcp_world_size) tokens in kvcache, @@ -283,7 +292,7 @@ class MultiGroupBlockTable: pin_memory, device, kernel_block_size, - dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size, ) for block_size, kernel_block_size in 
zip(block_sizes, kernel_block_sizes) ] diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index c1bfe727d86e5..7b4bc1d2a2241 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -87,7 +87,7 @@ class InputBatch: is_spec_decode: bool = False, is_pooling_model: bool = False, num_speculative_tokens: int = 0, - dcp_kv_cache_interleave_size: int = 1, + cp_kv_cache_interleave_size: int = 1, ): self.is_pooling_model = is_pooling_model self.is_spec_decode = is_spec_decode @@ -141,7 +141,7 @@ class InputBatch: block_sizes=block_sizes, kernel_block_sizes=kernel_block_sizes, num_speculative_tokens=num_speculative_tokens, - dcp_kv_cache_interleave_size=dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size=cp_kv_cache_interleave_size, ) # Sampling-related. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0c35f1330e9f0..80f8344d44100 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -426,7 +426,7 @@ class GPUModelRunner( # uses output token ids so we set this conservatively. 
logitsprocs_need_output_token_ids=bool(custom_logitsprocs), is_pooling_model=self.is_pooling_model, - dcp_kv_cache_interleave_size=self.parallel_config.dcp_kv_cache_interleave_size, + cp_kv_cache_interleave_size=self.parallel_config.cp_kv_cache_interleave_size, ) self.use_async_scheduling = self.scheduler_config.async_scheduling @@ -1436,7 +1436,7 @@ class GPUModelRunner( self.seq_lens.cpu[:num_reqs], self.dcp_world_size, self.dcp_rank, - self.parallel_config.dcp_kv_cache_interleave_size, + self.parallel_config.cp_kv_cache_interleave_size, ) self.dcp_local_seq_lens.copy_to_gpu(num_reqs) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 315f01b68499a..b8339fc4dc8b8 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -26,6 +26,7 @@ from vllm.distributed.kv_transfer import ( has_kv_transfer_group, ) from vllm.distributed.parallel_state import ( + get_pcp_group, get_pp_group, get_tp_group, ) @@ -733,6 +734,7 @@ class Worker(WorkerBase): module.global_num_experts = module.moe_config.num_experts module.moe_parallel_config = FusedMoEParallelConfig.make( tp_size_=get_tp_group().world_size, + pcp_size_=get_pcp_group().world_size, dp_size_=get_dp_group().world_size, vllm_parallel_config=parallel_config, ) @@ -886,6 +888,7 @@ def init_worker_distributed_environment( ensure_model_parallel_initialized( parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size, + parallel_config.prefill_context_parallel_size, parallel_config.decode_context_parallel_size, ) From 68d7231991cc307d6865eac5bfca551c06f67465 Mon Sep 17 00:00:00 2001 From: Ryan Rock Date: Wed, 19 Nov 2025 15:04:36 -0600 Subject: [PATCH 032/249] [CI/Build] Fix test_prefix_prefill for AMD (#28905) Signed-off-by: Ryan Rock --- tests/kernels/attention/test_prefix_prefill.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py 
index 78cdbbbf7379d..e041e8c8d2ffa 100644 --- a/tests/kernels/attention/test_prefix_prefill.py +++ b/tests/kernels/attention/test_prefix_prefill.py @@ -174,11 +174,11 @@ def test_contexted_kv_attention( block_table = values[: BS * max_block_per_request].view(BS, max_block_per_request) b_seq_len = torch.tensor(seq_lens, dtype=torch.int32) b_ctx_len = torch.tensor(ctx_lens, dtype=torch.int32) - b_start_loc = torch.cumsum(torch.tensor([0] + query_lens, dtype=torch.int32), dim=0) + b_start_loc = torch.cumsum(torch.tensor([0] + query_lens), dim=0).to(torch.int32) max_input_len = MAX_SEQ_LEN # copy kv to cache - b_seq_start_loc = torch.cumsum( - torch.tensor([0] + seq_lens[:-1], dtype=torch.int32), dim=0 + b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1]), dim=0).to( + torch.int32 ) for i in range(BS): for j in range(query_lens[i]): @@ -417,11 +417,11 @@ def test_contexted_kv_attention_alibi( block_table = values[: BS * max_block_per_request].view(BS, max_block_per_request) b_seq_len = torch.tensor(seq_lens, dtype=torch.int32) b_ctx_len = torch.tensor(ctx_lens, dtype=torch.int32) - b_start_loc = torch.cumsum(torch.tensor([0] + query_lens, dtype=torch.int32), dim=0) + b_start_loc = torch.cumsum(torch.tensor([0] + query_lens), dim=0).to(torch.int32) max_input_len = MAX_SEQ_LEN # copy kv to cache - b_seq_start_loc = torch.cumsum( - torch.tensor([0] + seq_lens[:-1], dtype=torch.int32), dim=0 + b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1]), dim=0).to( + torch.int32 ) for i in range(BS): for j in range(query_lens[i]): From 1607e664f0de4b7eb113c0259b889edbe73c4341 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 19 Nov 2025 16:18:32 -0500 Subject: [PATCH 033/249] [Bug] Fix Batch Invariant MLA test (#28967) Signed-off-by: yewentao256 --- tests/v1/determinism/test_batch_invariance.py | 41 +++++++++++++++---- vllm/model_executor/layers/batch_invariant.py | 2 +- 2 files changed, 33 insertions(+), 
10 deletions(-) diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py index f018ee551dbfe..d4e88891512c4 100644 --- a/tests/v1/determinism/test_batch_invariance.py +++ b/tests/v1/determinism/test_batch_invariance.py @@ -9,13 +9,33 @@ import torch from utils import _extract_step_logprobs, _random_prompt, skip_unsupported from vllm import LLM, SamplingParams +from vllm.platforms import current_platform + +BACKENDS: list[str] = [ + "FLASH_ATTN", + "FLASHINFER", +] + +if current_platform.is_cuda() and current_platform.is_device_capability(90): + BACKENDS.append("FLASH_ATTN_MLA") + +DEFAULT_MODEL = "Qwen/Qwen3-1.7B" +MLA_MODEL = "deepseek-ai/DeepSeek-V2-Lite-Chat" + + +def resolve_model_name(backend: str) -> str: + """Resolve the model name for the given backend, respecting env overrides.""" + model = os.getenv("VLLM_TEST_MODEL", DEFAULT_MODEL) + if backend.endswith("MLA") and model == DEFAULT_MODEL: + return MLA_MODEL + return model @skip_unsupported @pytest.mark.timeout(1000) @pytest.mark.parametrize( "backend", - ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"], + BACKENDS, ) def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( backend, monkeypatch: pytest.MonkeyPatch @@ -47,7 +67,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) # Allow overrides from environment (useful for CI tuning) # "facebook/opt-125m" is too small, doesn't reliably test determinism - model = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + model = resolve_model_name(backend) num_trials = int(os.getenv("VLLM_NEEDLE_TRIALS", "5")) max_batch_size = int(os.getenv("VLLM_NEEDLE_BATCH_SIZE", "128")) min_random_prompt = int(os.getenv("VLLM_MIN_PROMPT", "1024")) @@ -150,7 +170,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle( @skip_unsupported @pytest.mark.parametrize( "backend", - 
["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"], + BACKENDS, ) @pytest.mark.forked def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( @@ -160,7 +180,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) - model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + model_name = resolve_model_name(backend) tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1")) # For batch invariance, disable custom all-reduce to ensure deterministic @@ -369,7 +389,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( @skip_unsupported @pytest.mark.parametrize( "backend", - ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"], + BACKENDS, ) def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch): """ @@ -377,7 +397,7 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch): Useful for quick smoke testing and debugging. """ monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) - model = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + model = resolve_model_name(backend) llm = LLM( model=model, @@ -419,7 +439,7 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch): @skip_unsupported @pytest.mark.parametrize( "backend", - ["FLASH_ATTN", "FLASHINFER", "FLASH_ATTN_MLA", "FLASHINFER_MLA", "TRITON_MLA"], + BACKENDS, ) @pytest.mark.forked def test_logprobs_without_batch_invariance_should_fail( @@ -434,6 +454,9 @@ def test_logprobs_without_batch_invariance_should_fail( The test will PASS if we detect differences (proving batch invariance matters). The test will FAIL if everything matches (suggesting batch invariance isn't needed). 
""" + from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant + + vllm_is_batch_invariant.cache_clear() monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) # CRITICAL: Disable batch invariance for this test @@ -441,7 +464,7 @@ def test_logprobs_without_batch_invariance_should_fail( seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) - model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + model_name = resolve_model_name(backend) tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1")) print(f"\n{'=' * 80}") @@ -659,7 +682,7 @@ def test_decode_logprobs_match_prefill_logprobs( seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) - model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + model_name = resolve_model_name(backend) tp_size = int(os.getenv("VLLM_TEST_TP_SIZE", "1")) from vllm.model_executor.layers.batch_invariant import ( diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 7920d117de5e0..5dbeb29174349 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -803,11 +803,11 @@ def override_envs_for_invariance(): "FLASH_ATTN", # best supported backend "FLASHINFER", "FLASH_ATTN_MLA", - "FLASHINFER_MLA", "TRITON_MLA", # Not yet supported MLA backends # "FLASHMLA", # "FLEX_ATTENTION", # IMA issue even if we disable batch invariance + # "FLASHINFER_MLA", https://github.com/vllm-project/vllm/pull/28967 ] if curr_attn_backend not in supported_backends: warning = ( From cdeec2e6067613c501f82463d54e420097f49750 Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Wed, 19 Nov 2025 22:20:58 +0100 Subject: [PATCH 034/249] [BugFix] Ray with multiple nodes (#28873) Signed-off-by: Julien Denize --- vllm/v1/worker/gpu_worker.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/v1/worker/gpu_worker.py 
b/vllm/v1/worker/gpu_worker.py index b8339fc4dc8b8..7f9cdd221224b 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -205,14 +205,14 @@ class Worker(WorkerBase): assert self.local_rank < torch.cuda.device_count(), ( f"DP adjusted local rank {self.local_rank} is out of bounds. " ) - visible_device_count = ( - torch.cuda.device_count() if torch.cuda.is_available() else 0 - ) - assert self.parallel_config.local_world_size <= visible_device_count, ( - f"local_world_size ({self.parallel_config.local_world_size}) must be " - f"less than or equal to the number of visible devices " - f"({visible_device_count})." - ) + visible_device_count = ( + torch.cuda.device_count() if torch.cuda.is_available() else 0 + ) + assert self.parallel_config.local_world_size <= visible_device_count, ( + f"local_world_size ({self.parallel_config.local_world_size}) must " + f"be less than or equal to the number of visible devices " + f"({visible_device_count})." + ) self.device = torch.device(f"cuda:{self.local_rank}") current_platform.set_device(self.device) From 613abb50d5715ba693ee9d5b727e8385b98e7185 Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Wed, 19 Nov 2025 15:29:06 -0600 Subject: [PATCH 035/249] [MoE] Nvfp4 Masked Gemm: Add flashinfer grouped_gemm_nt_masked (#25990) Signed-off-by: Shu Wang. 
Signed-off-by: mgoin Co-authored-by: Michael Goin --- .buildkite/test-pipeline.yaml | 1 + tests/kernels/moe/test_cutedsl_moe.py | 582 ++++++++++++++++++ vllm/envs.py | 8 +- .../fused_moe/deepep_ll_prepare_finalize.py | 16 +- .../fused_moe/flashinfer_cutedsl_moe.py | 346 +++++++++++ .../layers/quantization/modelopt.py | 30 +- .../quantization/utils/flashinfer_fp4_moe.py | 43 +- .../quantization/utils/flashinfer_utils.py | 25 +- .../quantization/utils/nvfp4_moe_support.py | 6 +- vllm/utils/flashinfer.py | 42 ++ 10 files changed, 1064 insertions(+), 35 deletions(-) create mode 100644 tests/kernels/moe/test_cutedsl_moe.py create mode 100644 vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 98daebcc06931..5309581d8e81f 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -921,6 +921,7 @@ steps: - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py - label: Blackwell Fusion and Compile Tests # 30 min timeout_in_minutes: 40 diff --git a/tests/kernels/moe/test_cutedsl_moe.py b/tests/kernels/moe/test_cutedsl_moe.py new file mode 100644 index 0000000000000..af1a34d17d48b --- /dev/null +++ b/tests/kernels/moe/test_cutedsl_moe.py @@ -0,0 +1,582 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import pytest + +from vllm.platforms import current_platform + +if not current_platform.has_device_capability(100): + pytest.skip( + reason="Nvfp4 Requires compute capability of 10 or above.", + allow_module_level=True, + ) + +import torch +from flashinfer import fp4_quantize +from torch.nn import functional as F + +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe.flashinfer_cutedsl_moe import 
( + flashinfer_cutedsl_moe_masked, +) +from vllm.utils.flashinfer import ( + flashinfer_cutedsl_grouped_gemm_nt_masked as cutedsl_gmm_masked, +) +from vllm.utils.flashinfer import ( + scaled_fp4_grouped_quantize, +) + +kE2M1ToFloat = torch.tensor( + [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=torch.float32 +) + +FLOAT8_E4M3_MAX = 448.0 +FLOAT4_E2M1_MAX = 6.0 + + +def convert_swizzled_to_linear(a_sf_swizzled: torch.Tensor, m, k, block_size): + m_tiles = (m + 128 - 1) // 128 + f = block_size * 4 + k_tiles = (k + f - 1) // f + tmp = torch.reshape(a_sf_swizzled, (1, m_tiles, k_tiles, 32, 4, 4)) + tmp = torch.permute(tmp, (0, 1, 4, 3, 2, 5)) + out = tmp.reshape(m_tiles * 128, k_tiles * f // block_size) + return out[0:m, 0:k] + + +def dequantize_nvfp4_to_dtype( + tensor_fp4, tensor_sf, global_scale, dtype, device, block_size=16 +): + """Dequantize the fp4 tensor back to high precision.""" + # Two fp4 values are packed into one uint8. + assert tensor_fp4.dtype == torch.uint8 + m, packed_k = tensor_fp4.shape + k = packed_k * 2 + tensor_f32 = break_fp4_bytes(tensor_fp4, dtype) + tensor_f32 = tensor_f32.reshape(m, k // block_size, block_size) + tensor_sf = tensor_sf.view(torch.float8_e4m3fn) + tensor_sf = convert_swizzled_to_linear(tensor_sf, m, k, block_size) + tensor_sf_dtype = tensor_sf.to(torch.float32) / global_scale + + # scale the tensor + out = (tensor_f32 * tensor_sf_dtype.unsqueeze(-1)).reshape(m, k) + return out.to(dtype=dtype) + + +def break_fp4_bytes(a, dtype): + assert a.dtype == torch.uint8 + m, n = a.shape + + # Vectorized nibble processing + a_flat = a.flatten() + high = (a_flat & 0xF0) >> 4 # Upper nibbles + low = a_flat & 0x0F # Lower nibbles + + # Combine nibbles for batch processing + combined = torch.stack((low, high), dim=1).flatten() + + # Vectorized sign and magnitude extraction + signs = (combined & 0x08).to(torch.bool) # Sign bits + abs_vals = (combined & 0x07).to(torch.long) # Magnitude indices + + # Device-aware lookup and sign application + 
kE2M1 = kE2M1ToFloat.to(device=a.device) + values = kE2M1[abs_vals] * torch.where(signs, -1.0, 1.0) + + # Reshape to final form + return values.reshape(m, n * 2).to(dtype=dtype) + + +def generate_balanced_routing( + hidden_states: torch.Tensor, num_experts: int, top_k: int +): + """ + Generate routing weights and topk indices such that every expert is active. + Returns routing_weights, topk_idx + """ + + num_tokens, hidden_dim = hidden_states.shape + # num_tokens = batch_size * seq_len + + # First, assign at least one token per expert + tokens_per_expert = torch.arange(num_tokens) % num_experts + tokens_per_expert = tokens_per_expert[torch.randperm(num_tokens)] # shuffle + + # Each token has top_k experts — start with one guaranteed expert + topk_idx = torch.full((num_tokens, top_k), -1, dtype=torch.long) + topk_idx[:, 0] = tokens_per_expert + + # For remaining top_k - 1 experts, pick randomly (allowing repeats) + if top_k > 1: + random_choices = torch.randint(0, num_experts, (num_tokens, top_k - 1)) + topk_idx[:, 1:] = random_choices + + # Normalize routing weights so each token's weights sum to 1 + routing_weights = torch.rand(num_tokens, top_k) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + + # Reshape back if needed + routing_weights = routing_weights.view(num_tokens, top_k) + topk_idx = topk_idx.view(num_tokens, top_k) + + return routing_weights, topk_idx + + +def prepare_inputs( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + num_experts: int, + topk: int, +): + routing_weights, topk_idx = generate_balanced_routing( + router_logits, num_experts, topk + ) + + masked_m = [] + for i in range(num_experts): + mask = topk_idx.view(-1) == i + masked_m.append(mask.sum()) + + masked_m = torch.tensor(masked_m, dtype=torch.int32) + # Intialize the hidden_states_3d with ones instead of empty to avoid nan + # issue. 
+ hidden_states_3d = torch.ones( + (num_experts, max(masked_m), hidden_states.shape[1]), dtype=hidden_states.dtype + ) + for i in range(num_experts): + hidden_states_3d[i, : masked_m[i], :] = hidden_states[topk_idx.view(-1) == i] + + return hidden_states_3d, masked_m, topk_idx, routing_weights + + +MNK_FACTORS = [ + (2, 1024, 1024), + (2, 1024, 1536), + (2, 3072, 1024), + (2, 3072, 1536), + (64, 1024, 1024), + (64, 1024, 1536), + (64, 3072, 1024), + (64, 2048, 1024), + (224, 1024, 1024), + (224, 1024, 1536), +] + + +# Reference implementation of torch_moe +def torch_moe(a, w1, w2, score, topk, expert_map): + B, D = a.shape + a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device) + score = torch.softmax(score, dim=-1, dtype=torch.float32) + topk_weight, topk_ids = torch.topk(score, topk) + topk_weight = topk_weight.view(-1) + topk_ids = topk_ids.view(-1) + if expert_map is not None: + topk_ids = expert_map[topk_ids] + for i in range(w1.shape[0]): + mask = topk_ids == i + if mask.sum(): + out[mask] = SiluAndMul()(a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose( + 0, 1 + ) + return ( + out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype) + ).sum(dim=1) + + +def torch_moe_nvfp4(a, w1, w2, topk, topk_weight, topk_ids): + B, D = a.shape + a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device) + + topk_weight = topk_weight.view(-1) + topk_ids = topk_ids.view(-1) + + for i in range(w1.shape[0]): + mask = topk_ids == i + if mask.sum(): + m = w1[i].shape[0] + assert m % 2 == 0 + # Note: w1 and w3 are swapped! 
+ w3_expert, w1_expert = w1[i][m // 2 :, :], w1[i][: m // 2, :] + inter = F.silu(a[mask] @ w1_expert.t()) * (a[mask] @ w3_expert.t()) + inter_gs = torch.tensor(1.0).cuda() + inter_q, inter_blockscale = fp4_quantize(inter, inter_gs) + inter = dequantize_nvfp4_to_dtype( + inter_q, + inter_blockscale, + inter_gs, + dtype=inter.dtype, + device=inter.device, + block_size=16, + ).cuda() + out[mask] = inter @ w2[i].transpose(0, 1) + return ( + out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype) + ).sum(dim=1) + + +def grouped_gemm_ref( + hidden_states_expanded: torch.Tensor, + hidden_states_3d: torch.Tensor, + weights: torch.Tensor, + topk_idx: torch.Tensor, + masked_m: torch.Tensor, + B: int, + topk: int, + num_experts: int, + *, + block_size: int = 16, +) -> torch.Tensor: + """ + Computes the reference grouped GEMM (fp4 quantized per-expert loop), + computes flashinfer grouped GEMM (for scale consistency), + and returns ONLY the repacked reference output: out_ref. + + Returns: + out_ref: Tensor [num_experts, max_m, n_out] + """ + device_hs = hidden_states_expanded.device + device_w = weights.device + out_dtype = weights.dtype + n_out = weights.shape[1] + + # Flattened reference output (B*topk, n_out) + out = torch.zeros((B * topk, n_out), dtype=out_dtype, device=device_w) + + # Per-expert reference compute loop + for i in range(num_experts): + mask = topk_idx.view(-1) == i + if mask.any(): + lhs = hidden_states_expanded[mask] + rhs = weights[i] + + a_amax = lhs.abs().max().to(torch.float32).to(device_hs) + b_amax = rhs.abs().max().to(torch.float32).to(device_w) + + a_gs = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / a_amax + b_gs = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax + + lhsq, lhsq_sf = fp4_quantize(lhs, a_gs) + rhsq, rhsq_sf = fp4_quantize(rhs, b_gs) + + lhs_in_dtype = dequantize_nvfp4_to_dtype( + lhsq, + lhsq_sf, + a_gs, + dtype=lhs.dtype, + device=device_hs, + block_size=block_size, + ) + rhs_in_dtype = dequantize_nvfp4_to_dtype( + rhsq, + rhsq_sf, 
+ b_gs, + dtype=rhs.dtype, + device=device_w, + block_size=block_size, + ) + + out[mask] = lhs_in_dtype @ rhs_in_dtype.t() + + # Determine per-expert max_m + max_m_val = int(masked_m.max().item()) + + # Repack into [num_experts, max_m, n_out] + out_ref = torch.zeros( + (num_experts, max_m_val, n_out), + dtype=out.dtype, + device=out.device, + ) + expert_slot = [0] * num_experts + + for i, expert_id in enumerate(topk_idx.view(-1).tolist()): + slot = expert_slot[expert_id] + if slot < max_m_val: + out_ref[expert_id, slot, :] = out[i] + expert_slot[expert_id] += 1 + else: + raise IndexError( + f"Expert {expert_id} exceeded max slots ({max_m_val}). " + "Increase max_m or check masked_m." + ) + + return out_ref + + +def flashinfer_cutedsl_grouped_gemm_nt_masked( + hidden_states: torch.Tensor, # 3d + input_global_scale: torch.Tensor, # (l,) + weights: torch.Tensor, + w_global_scale: torch.Tensor, # (l,) + masked_m: torch.Tensor, +): + # hidden_states: [l, m, k] + # weights: [l, n, k] + aq, aq_sf = scaled_fp4_grouped_quantize( + hidden_states, + masked_m.to(hidden_states.device), + input_global_scale, + ) + num_experts, n, k = weights.shape + bq, bq_sf = scaled_fp4_grouped_quantize( + weights, + torch.full((num_experts,), n, device=weights.device, dtype=torch.int32), + w_global_scale, + ) + + out = torch.zeros( + (num_experts, max(masked_m), n), dtype=weights.dtype, device=aq.device + ) + out = out.permute(1, 2, 0) # requirement of kernel + sf_vec_size = 16 + ab_dtype = "float4_e2m1fn" + sf_dtype = "float8_e4m3fn" + c_dtype = "bfloat16" + alpha = 1.0 / (input_global_scale * w_global_scale).to(out.dtype).view( + 1, 1, num_experts + ) + + def get_cute_dtype(input: torch.Tensor) -> str: + if input.dtype == torch.bfloat16: + return "bfloat16" + elif input.dtype == torch.float16: + return "float16" + elif input.dtype == torch.float32: + return "float32" + else: + raise ValueError(f"Unsupported cute dtype {input.dtype}") + + cutedsl_gmm_masked( + (aq, aq_sf), + (bq, bq_sf), + 
out, + masked_m.to(aq.device), + ab_dtype=ab_dtype, + sf_dtype=sf_dtype, + c_dtype=c_dtype, + sf_vec_size=sf_vec_size, + alpha=alpha, + alpha_dtype=get_cute_dtype(alpha), + ) + + return out + + +@pytest.mark.parametrize("bs, hidden_dim, inter_dim", [(2, 128, 256), (16, 128, 512)]) +@pytest.mark.parametrize("topk", [1, 2, 4]) +@torch.inference_mode() +def test_flashinfer_cutedsl_moe_masked( + bs: int, hidden_dim: int, inter_dim: int, topk: int +): + torch.manual_seed(42) + device = "cuda" + num_experts = 8 + hidden_states = ( + torch.randn(bs, hidden_dim, dtype=torch.bfloat16, device=device) / 5.0 + ) + w1 = ( + torch.randn( + num_experts, 2 * inter_dim, hidden_dim, dtype=torch.bfloat16, device=device + ) + / 10.0 + ) + w2 = ( + torch.randn( + num_experts, hidden_dim, inter_dim, dtype=torch.bfloat16, device=device + ) + / 10.0 + ) + router_logits = torch.randn(bs, num_experts, dtype=torch.float32) + + hidden_states_expanded = ( + hidden_states.view(bs, -1, hidden_dim) + .repeat(1, topk, 1) + .reshape(-1, hidden_dim) + ) + hidden_states_3d, masked_m, topk_idx, routing_weights = prepare_inputs( + hidden_states_expanded, router_logits, num_experts, topk + ) + + w1_amax = w1.abs().amax(dim=(1, 2)).to(torch.float32).to(w1.device) + w2_amax = w2.abs().amax(dim=(1, 2)).to(torch.float32).to(w2.device) + input_global_scale = torch.ones( + (num_experts,), dtype=torch.float32, device=hidden_states.device + ) + + w1_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w1_amax + w2_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / w2_amax + a2_global_scale = torch.ones( + (num_experts,), dtype=torch.float32, device=hidden_states.device + ) # assume intermediate scale is 1.0 + + w1_fp4, w1_blockscale = scaled_fp4_grouped_quantize( + w1, + torch.ones(num_experts, dtype=torch.int32, device=w1.device) * 2 * inter_dim, + w1_global_scale, + ) + w2_fp4, w2_blockscale = scaled_fp4_grouped_quantize( + w2, + torch.ones(num_experts, dtype=torch.int32, device=w2.device) * hidden_dim, + 
w2_global_scale, + ) + + w1_alpha = 1.0 / (input_global_scale * w1_global_scale) + w2_alpha = 1.0 / (a2_global_scale * w2_global_scale) + + out = torch.empty_like(hidden_states_3d) + # Note: the 1st dim shouldn't be bs + wk = torch.empty( + num_experts, + hidden_states_3d.shape[1], + inter_dim * 2, + dtype=hidden_states_3d.dtype, + device=hidden_states.device, + ) + flashinfer_cutedsl_moe_masked( + hidden_states_3d.to(hidden_states.device), + input_global_scale, + w1_fp4.permute(2, 0, 1), + w1_blockscale, + w1_alpha, + w2_fp4.permute(2, 0, 1), + a2_global_scale, + w2_blockscale, + w2_alpha, + masked_m.to(hidden_states.device), + wk, + out, + ) + + # reference + a_fp4, a_scale_interleaved = fp4_quantize(hidden_states, input_global_scale) + a_in_dtype = dequantize_nvfp4_to_dtype( + a_fp4, + a_scale_interleaved, + input_global_scale, + dtype=hidden_states.dtype, + device=hidden_states.device, + block_size=16, + ) + w1_d = torch.empty( + (num_experts, 2 * inter_dim, hidden_dim), device=w1.device, dtype=w1.dtype + ) + w2_d = torch.empty( + (num_experts, hidden_dim, inter_dim), device=w2.device, dtype=w2.dtype + ) + + for idx in range(0, num_experts): + w1_fp4_sliced, w1_blockscale_sliced = fp4_quantize( + w1[idx], w1_global_scale[idx] + ) + w2_fp4_sliced, w2_blockscale_sliced = fp4_quantize( + w2[idx], w2_global_scale[idx] + ) + w1_d[idx] = dequantize_nvfp4_to_dtype( + w1_fp4_sliced, + w1_blockscale_sliced, + w1_global_scale[idx], + dtype=w1.dtype, + device=w1.device, + block_size=16, + ) + w2_d[idx] = dequantize_nvfp4_to_dtype( + w2_fp4_sliced, + w2_blockscale_sliced, + w2_global_scale[idx], + dtype=w2.dtype, + device=w2.device, + block_size=16, + ) + + ref_output = torch_moe_nvfp4( + a_in_dtype, + w1_d, + w2_d, + topk, + routing_weights.to(a_in_dtype.device), + topk_idx.to(a_in_dtype.device), + ) + out_weighted = torch.zeros_like(ref_output, device=out.device, dtype=out.dtype) + + positions = torch.nonzero(masked_m[topk_idx], as_tuple=False) + rows, cols = 
positions[:, 0], positions[:, 1] + experts = topk_idx[rows, cols] + for i in range(num_experts): + mask = experts == i + if mask.any(): + idx = torch.nonzero(mask, as_tuple=False).squeeze(-1) + r, c = rows[idx], cols[idx] + out_weighted[r] += out[i, : len(r), :] * routing_weights[r, c].to( + out.device + ).unsqueeze(-1) + torch.testing.assert_close( + out_weighted.cpu(), ref_output.cpu(), atol=2e-1, rtol=2e-1 + ) + + +@pytest.mark.parametrize( + "bs, hidden_dim, inter_dim, topk", [(2, 128, 256, 2), (16, 128, 512, 5)] +) +@torch.inference_mode() +def test_grouped_gemm_nt_masked( + bs: int, hidden_dim: int, inter_dim: int, topk: int +) -> None: + torch.manual_seed(42) + B = bs + D = hidden_dim + N = inter_dim + # CuteDSL group gemm has issue when not all experts are active. + # i.e. masked = [2, 3, 0, 0, 1] where the 2nd and 3rd experts are inactive + # see https://github.com/flashinfer-ai/flashinfer/issues/1856 + num_experts = bs + hidden_states = torch.randn(B, D, dtype=torch.bfloat16, device="cuda") + weights = torch.randn(num_experts, N, D, dtype=torch.bfloat16, device="cuda") + router_logits = torch.randn(B, num_experts, dtype=torch.float32) + + hidden_states_expanded = ( + hidden_states.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + ) + hidden_states_3d, masked_m, topk_idx, _ = prepare_inputs( + hidden_states_expanded, router_logits, num_experts, topk + ) + + a_amax = ( + hidden_states_3d.abs() + .amax(dim=(1, 2)) + .to(torch.float32) + .to(hidden_states.device) + ) + b_amax = weights.abs().amax(dim=(1, 2)).to(torch.float32).to(weights.device) + a_gs = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / a_amax + b_gs = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax + out_flashinfer = flashinfer_cutedsl_grouped_gemm_nt_masked( + hidden_states_3d.to(hidden_states.device), a_gs, weights, b_gs, masked_m + ) + # reference + out_ref = grouped_gemm_ref( + hidden_states_expanded=hidden_states_expanded, + hidden_states_3d=hidden_states_3d, + weights=weights, + topk_idx=topk_idx, + 
masked_m=masked_m, + B=B, + topk=topk, + num_experts=num_experts, + ) + # Note: just to compare the masked position due to cutedsl may write nan + # into unmasked position. + for i in range(num_experts): + torch.testing.assert_close( + out_flashinfer.permute(2, 0, 1)[i, : masked_m[i]], + out_ref.to(out_flashinfer.device)[i, : masked_m[i]], + atol=1e-1, + rtol=1e-1, + ) + + +if __name__ == "__main__": + test_flashinfer_cutedsl_moe_masked(16, 128, 512, 4) + test_grouped_gemm_nt_masked(16, 128, 512, 4) diff --git a/vllm/envs.py b/vllm/envs.py index 212d68114e46e..1ff620af57229 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -157,7 +157,9 @@ if TYPE_CHECKING: VLLM_USE_FLASHINFER_MOE_FP16: bool = False VLLM_USE_FLASHINFER_MOE_FP8: bool = False VLLM_USE_FLASHINFER_MOE_FP4: bool = False - VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency"] = "latency" + VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency", "masked_gemm"] = ( + "latency" + ) VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE: int = 394 * 1024 * 1024 VLLM_XGRAMMAR_CACHE_MB: int = 0 VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256 @@ -1238,7 +1240,9 @@ environment_variables: dict[str, Callable[[], Any]] = { # - "latency": # Uses TensorRT-LLM kernels optimized for low-latency inference. "VLLM_FLASHINFER_MOE_BACKEND": env_with_choices( - "VLLM_FLASHINFER_MOE_BACKEND", "latency", ["throughput", "latency"] + "VLLM_FLASHINFER_MOE_BACKEND", + "latency", + ["throughput", "latency", "masked_gemm"], ), # Control the workspace buffer size for the FlashInfer backend. 
"VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE": lambda: int( diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py index e0db248958b47..fea9f49c04b89 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py @@ -6,6 +6,7 @@ import deep_ep import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm import envs from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( @@ -27,6 +28,8 @@ logger = init_logger(__name__) DEEPEP_QUANT_BLOCK_SIZE = 128 DEEPEP_QUANT_BLOCK_SHAPE = [DEEPEP_QUANT_BLOCK_SIZE, DEEPEP_QUANT_BLOCK_SIZE] +logger = init_logger(__name__) + def dequant_fp8( expert_x_fp8: torch.Tensor, expert_x_scales: torch.Tensor @@ -187,16 +190,25 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): # TODO (varun): Optimization - Use a batched version of quant x = x.view((-1, hidden_dim)) + q_dtype = quant_config.quant_dtype + + if envs.VLLM_FLASHINFER_MOE_BACKEND == "masked_gemm": + logger.info_once( + "Skip quantization when using FlashInfer CUTEDSL(masked_gemm) " + "for ModelOptNvFp4FusedMoE." 
+ ) + q_dtype = None + x, x_scales = moe_kernel_quantize_input( x, quant_config.a1_scale, - quant_config.quant_dtype, + q_dtype, quant_config.per_act_token_quant, quant_config.block_shape, ) x = x.view((num_experts, -1, hidden_dim)) - if quant_config.quant_dtype is not None: + if q_dtype is not None: assert x_scales is not None x_scales = normalize_batched_scales_shape(x_scales, num_experts) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py new file mode 100644 index 0000000000000..2747ef04a3499 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py @@ -0,0 +1,346 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( + TopKWeightAndReduceDelegate, +) +from vllm.utils.flashinfer import ( + flashinfer_cutedsl_grouped_gemm_nt_masked, + has_flashinfer_cutedsl_grouped_gemm_nt_masked, + scaled_fp4_grouped_quantize, + silu_and_mul_scaled_nvfp4_experts_quantize, +) + +logger = init_logger(__name__) + + +def is_valid_flashinfer_cutedsl_fused_moe( + hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor +) -> bool: + """ + Check if the given problem size is supported by the FlashInfer CuteDSL MoE + kernel. + """ + if not has_flashinfer_cutedsl_grouped_gemm_nt_masked(): + logger.debug_once( + "FlashInferCuteDSLExperts disabled: " + "flashinfer_cutedsl_fused_moe not available." 
+ ) + return False + # Data type checks + if ( + w1.dtype != torch.uint8 + or w2.dtype != torch.uint8 + or hidden_states.dtype not in [torch.float32, torch.float16, torch.bfloat16] + ): + logger.debug_once( + "FlashInferCuteDSLExperts disabled: w1/w2 must be torch.uint8 " + f"(got w1={w1.dtype}, w2={w2.dtype}), hidden_states must be " + f"float32, float16, or bfloat16 (got {hidden_states.dtype})." + ) + return False + return True + + +class FlashInferCuteDSLExperts(mk.FusedMoEPermuteExpertsUnpermute): + def __init__( + self, + out_dtype: torch.dtype, + quant_config: FusedMoEQuantConfig, + ): + super().__init__(quant_config) + assert quant_config.quant_dtype == "nvfp4", ( + "Only nvfp4 quantization are currently supported." + ) + self.out_dtype = out_dtype + + @property + def activation_formats( + self, + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return ( + mk.FusedMoEActivationFormat.BatchedExperts, + mk.FusedMoEActivationFormat.BatchedExperts, + ) + + def supports_expert_map(self) -> bool: + return False + + def supports_chunking(self) -> bool: + # This refers to TP chunking; DP chunking is handled separately. + # TODO(shuw@nvidia.com): Set to False to be consistent with + # batched_deep_gemm_moe + return False + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + # Let PrepareAndFinalize::finalize() decide the impl. + return TopKWeightAndReduceDelegate() + + def workspace_shapes( + self, + M: int, + N: int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + # We use global_num_experts due to how moe_align_block_size handles + # expert_maps. + """ + Compute the shapes for the temporary and final outputs of the two gemms + and activation in the fused expert function. 
Since the gemms are + independent, the workspace for the first gemm can be shared with the + workspace for the last gemm. + + Returns a tuple of: + - workspace13 shape tuple: must be large enough to hold the + result of either expert gemm. + - workspace2 shape tuple: must be large enough to hold the + result of the activation function. + - output shape tuple: must be exact size of the final gemm output. + - Workspace type: The dtype to use for the workspace tensors. + - Note: in order for activation chunking to work, the first dimension + of each tuple must be the number of tokens. + """ + output_shape = (local_num_experts, M, K) + workspace2 = (local_num_experts, M, N) + workspace1 = output_shape + return (workspace1, workspace2, output_shape) + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: torch.Tensor | None, + a1q_scale: torch.Tensor | None, + a2_scale: torch.Tensor | None, # Not used + workspace13: torch.Tensor | None, + workspace2: torch.Tensor | None, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + apply_router_weight_on_input: bool | None, + ): + assert self.quant_dtype == "nvfp4", ( + "Only nvfp4 quantization are currently supported." 
+ ) + # Ensure w1_scale and w2_scale are not None before calling view + assert self.w1_scale is not None and self.w2_scale is not None, ( + "w1_scale and w2_scale must not be None for FlashInferExperts" + ) + assert expert_tokens_meta is not None + expert_num_tokens = expert_tokens_meta.expert_num_tokens + assert hidden_states.ndim == 3 + assert self.w1_scale.ndim == 3 + assert self.w2_scale.ndim == 3 + flashinfer_cutedsl_moe_masked( + hidden_states=hidden_states, + input_global_scale=self.a1_gscale, + w1=w1, + w1_blockscale=self.w1_scale, + w1_alpha=self.g1_alphas, + w2=w2, + a2_global_scale=self.a2_gscale, + w2_blockscale=self.w2_scale, + w2_alpha=self.g2_alphas, + masked_m=expert_num_tokens, + workspace=workspace2, + out=output, + ) + + +def get_cute_dtype(input: torch.Tensor) -> str: + if input.dtype == torch.bfloat16: + return "bfloat16" + elif input.dtype == torch.float16: + return "float16" + elif input.dtype == torch.float32: + return "float32" + else: + raise ValueError(f"Unsupported cute dtype {input.dtype}") + + +def flashinfer_cutedsl_moe_masked( + hidden_states: torch.Tensor, + input_global_scale: torch.Tensor, + w1: torch.Tensor, + w1_blockscale: torch.Tensor, + w1_alpha, + w2: torch.Tensor, + a2_global_scale: torch.Tensor, + w2_blockscale: torch.Tensor, + w2_alpha, + masked_m: torch.Tensor, + workspace: torch.Tensor, + out: torch.Tensor, +): + """ + Perform masked Mixture-of-Experts computation with FlashInfer's CuteDSL + kernels. 
+ + Args: + hidden_states (torch.Tensor): [num_experts, m, k], bf16 + input_global_scale (torch.Tensor): (l,) + w1 (torch.Tensor): fp4 weights, [l, 2 * n, k // 2], uint8 + w1_blockscale (torch.Tensor): blockscale factors, e4m3, + w1_alpha (torch.Tensor): (l,) + w2 (torch.Tensor): fp4 weights, [l, k, n // 2], uint8 + a2_global_scale (torch.Tensor): (l,) + w2_blockscale (torch.Tensor): blockscale factors, e4m3, + w2_alpha (torch.Tensor): (l,) + masked_m (torch.Tensor): Masked dimension indices + workspace (torch.Tensor): For gateup_output + + Notes: + - Assumes max(masked_m) <= m. + """ + + # === Assertions on dtypes === + assert input_global_scale.dtype == torch.float32, ( + f"input_global_scale must be float32, got {input_global_scale.dtype}" + ) + assert w1.dtype == torch.uint8, f"w1 must be uint8, got {w1.dtype}" + assert w1_blockscale.dtype == torch.float8_e4m3fn, ( + f"w1_blockscale must be float8_e4m3fn, got {w1_blockscale.dtype}" + ) + assert w1_alpha.dtype == torch.float32, ( + f"w1_alpha must be float32, got {w1_alpha.dtype}" + ) + assert w2.dtype == torch.uint8, f"w2 must be uint8, got {w2.dtype}" + assert a2_global_scale.dtype == torch.float32, ( + f"a2_global_scale must be float32, got {a2_global_scale.dtype}" + ) + assert w2_blockscale.dtype == torch.float8_e4m3fn, ( + f"w2_blockscale must be float8_e4m3fn, got {w2_blockscale.dtype}" + ) + assert w2_alpha.dtype == torch.float32, ( + f"w2_alpha must be float32, got {w2_alpha.dtype}" + ) + + # === Assertions on shapes === + n = w2.shape[-1] * 2 # intermediate dimension + num_experts, m, k = hidden_states.shape + + assert w1.shape[-2] == 2 * n, f"w1 last-2 dim must be 2*n, got {w1.shape}" + assert w1.shape[-1] * 2 == k, ( + f"w1 last dim * 2 must equal k, got {w1.shape[-1]} vs k={k}" + ) + assert w2.shape[-2:] == ( + k, + n // 2, + ), f"w2 shape mismatch, got {w2.shape[-2:]}, expected {(k, n // 2)}" + + assert input_global_scale.shape == (num_experts,), ( + f"input_global_scale must be (l,), got 
{input_global_scale.shape}" + ) + assert w1_alpha.shape == (num_experts,), ( + f"w1_alpha must be (l,), got {w1_alpha.shape}" + ) + assert a2_global_scale.shape == (num_experts,), ( + f"a2_global_scale must be (l,), got {a2_global_scale.shape}" + ) + assert w2_alpha.shape == (num_experts,), ( + f"w2_alpha must be (l,), got {w2_alpha.shape}" + ) + + aq, aq_sf = scaled_fp4_grouped_quantize( + hidden_states, + masked_m, + input_global_scale, + ) + + workspace = workspace.permute(1, 2, 0) # requirement of kernel + sf_vec_size = 16 + assert aq_sf.dtype == torch.float8_e4m3fn + assert aq.dtype == torch.uint8 + ab_dtype = "float4_e2m1fn" + sf_dtype = "float8_e4m3fn" + + c_dtype = get_cute_dtype(hidden_states) + + # Gemm1 + flashinfer_cutedsl_grouped_gemm_nt_masked( + (aq, aq_sf), + (w1.permute(1, 2, 0), w1_blockscale), + workspace, + masked_m, + ab_dtype=ab_dtype, + sf_dtype=sf_dtype, + c_dtype=c_dtype, + sf_vec_size=sf_vec_size, + alpha=w1_alpha.view(1, 1, num_experts), + alpha_dtype=get_cute_dtype(w1_alpha), + ) # in logical [m, n, l] + + # SILU and quantization + diq, diq_sf = silu_and_mul_scaled_nvfp4_experts_quantize( + workspace.permute(2, 0, 1), + masked_m, + a2_global_scale, + ) + + # Gemm2 + out = out.permute(1, 2, 0) # requirement of kernel + flashinfer_cutedsl_grouped_gemm_nt_masked( + (diq, diq_sf), + (w2.permute(1, 2, 0), w2_blockscale), + out, + masked_m, + ab_dtype=ab_dtype, + sf_dtype=sf_dtype, + c_dtype=c_dtype, + sf_vec_size=sf_vec_size, + alpha=w2_alpha.view(1, 1, num_experts), + alpha_dtype=get_cute_dtype(w2_alpha), + ) # in logical [m, k, l] + out = out.permute(2, 0, 1) + + +def flashinfer_cutedsl_moe_fp4( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + quant_config: FusedMoEQuantConfig, + inplace: bool = False, + activation: str = "silu", + global_num_experts: int = -1, + expert_map: torch.Tensor | None = None, + apply_router_weight_on_input: bool = False, +) -> 
torch.Tensor: + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 + create_flashinfer_prepare_finalize, + ) + + fused_experts = mk.FusedMoEModularKernel( + create_flashinfer_prepare_finalize(use_dp=False), # could be swapped later + FlashInferCuteDSLExperts( + out_dtype=hidden_states.dtype, + quant_config=quant_config, + ), + ) + + return fused_experts( + hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=inplace, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input, + ) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 38ab7cd4f115c..f684c17452a9b 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1468,7 +1468,10 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): gemm1_weight = layer.w13_weight.data gemm1_weight_scale = layer.w13_weight_scale.data - if self.allow_flashinfer: + if ( + self.allow_flashinfer + and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS + ): gemm1_weight, gemm1_weight_scale = reorder_w1w3_to_w3w1( gemm1_weight, gemm1_weight_scale, dim=-2 ) @@ -1746,17 +1749,26 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): workspace=layer.workspace, ) - elif ( - self.allow_flashinfer - and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS - ): - from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 - flashinfer_cutlass_moe_fp4, + elif self.allow_flashinfer: + assert self.flashinfer_moe_backend in ( + FlashinferMoeBackend.CUTLASS, + FlashinferMoeBackend.CUTEDSL, ) + if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 + flashinfer_cutlass_moe_fp4, + ) + 
+ flashinfer_fn_moe_fp4 = flashinfer_cutlass_moe_fp4 + else: + from vllm.model_executor.layers.fused_moe.flashinfer_cutedsl_moe import ( # noqa: E501 + flashinfer_cutedsl_moe_fp4, + ) + + flashinfer_fn_moe_fp4 = flashinfer_cutedsl_moe_fp4 assert self.moe_quant_config is not None - - return flashinfer_cutlass_moe_fp4( + return flashinfer_fn_moe_fp4( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index fdf330329e20c..36e8599dd9484 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -10,6 +10,9 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEQuantConfig, ) +from vllm.model_executor.layers.fused_moe.flashinfer_cutedsl_moe import ( + FlashInferCuteDSLExperts, +) from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( FlashInferExperts, ) @@ -17,10 +20,14 @@ from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize im create_flashinfer_prepare_finalize, ) from vllm.platforms import current_platform -from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe +from vllm.utils.flashinfer import ( + has_flashinfer_cutedsl_grouped_gemm_nt_masked, + has_flashinfer_cutlass_fused_moe, +) __all__ = [ "is_flashinfer_fp4_cutlass_moe_available", + "is_flashinfer_fp4_cutedsl_moe_available", "reorder_w1w3_to_w3w1", "build_flashinfer_fp4_cutlass_moe_prepare_finalize", ] @@ -36,6 +43,16 @@ def is_flashinfer_fp4_cutlass_moe_available() -> bool: ) +def is_flashinfer_fp4_cutedsl_moe_available() -> bool: + """Return ``True`` when FlashInfer CUTEDSL NV-FP4 kernels can be used.""" + return ( + envs.VLLM_USE_FLASHINFER_MOE_FP4 + and has_flashinfer_cutedsl_grouped_gemm_nt_masked() + and current_platform.is_cuda() + and 
current_platform.is_device_capability(100) + ) + + def reorder_w1w3_to_w3w1( weight: torch.Tensor, scale: torch.Tensor, dim: int = -2 ) -> tuple[torch.Tensor, torch.Tensor]: @@ -72,15 +89,21 @@ def select_nvfp4_gemm_impl( """Return a GEMM *experts* implementation for NV-FP4 fused-MoE layers""" if allow_flashinfer: - return FlashInferExperts( - out_dtype=moe.in_dtype, - quant_config=moe_quant_config, - ep_rank=moe.moe_parallel_config.ep_rank, - ep_size=moe.moe_parallel_config.ep_size, - tp_rank=moe.moe_parallel_config.tp_rank, - tp_size=moe.moe_parallel_config.tp_size, - use_dp=moe.moe_parallel_config.dp_size > 1, - ) + if envs.VLLM_FLASHINFER_MOE_BACKEND == "masked_gemm": + return FlashInferCuteDSLExperts( + out_dtype=moe.in_dtype, + quant_config=moe_quant_config, + ) + elif envs.VLLM_FLASHINFER_MOE_BACKEND == "throughput": + return FlashInferExperts( + out_dtype=moe.in_dtype, + quant_config=moe_quant_config, + ep_rank=moe.moe_parallel_config.ep_rank, + ep_size=moe.moe_parallel_config.ep_size, + tp_rank=moe.moe_parallel_config.tp_rank, + tp_size=moe.moe_parallel_config.tp_size, + use_dp=moe.moe_parallel_config.dp_size > 1, + ) # native cutlass experts currently don't support DP; TP case won't call this raise ValueError( diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index f22e17945d1f6..7eba8359b92f6 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -25,6 +25,7 @@ logger = init_logger(__name__) class FlashinferMoeBackend(Enum): TENSORRT_LLM = "TensorRT-LLM" CUTLASS = "CUTLASS" + CUTEDSL = "CUTEDSL" def calculate_tile_tokens_dim(num_tokens, top_k, num_experts): @@ -273,19 +274,21 @@ def flashinfer_cutlass_moe_fp8( def get_flashinfer_moe_backend() -> FlashinferMoeBackend: - flashinfer_moe_backend = envs.VLLM_FLASHINFER_MOE_BACKEND - # Prefer CUTLASS on SM90 to cover 
both SM90/SM100 generations - if flashinfer_moe_backend == "throughput" or current_platform.is_device_capability( - 90 - ): - return FlashinferMoeBackend.CUTLASS - elif flashinfer_moe_backend == "latency": - return FlashinferMoeBackend.TENSORRT_LLM + backend_map = { + "throughput": FlashinferMoeBackend.CUTLASS, + "latency": FlashinferMoeBackend.TENSORRT_LLM, + "masked_gemm": FlashinferMoeBackend.CUTEDSL, + } + + flashinfer_moe_backend = envs.VLLM_FLASHINFER_MOE_BACKEND + if flashinfer_moe_backend in backend_map: + return backend_map[flashinfer_moe_backend] + elif current_platform.is_device_capability(90): + return FlashinferMoeBackend.CUTLASS - allowed_backends = ["throughput", "latency"] raise ValueError( - f"Unknown flashinfer moe backend: {flashinfer_moe_backend}" - f" expected one of {allowed_backends}" + f"Unknown flashinfer moe backend: {flashinfer_moe_backend!r}. " + f"Expected one of {list(backend_map.keys())}." ) diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py index c3f26cc774118..44c5b027daf4f 100644 --- a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py +++ b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py @@ -5,6 +5,7 @@ from dataclasses import dataclass import vllm.envs as envs from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( + is_flashinfer_fp4_cutedsl_moe_available, is_flashinfer_fp4_cutlass_moe_available, ) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( @@ -32,7 +33,10 @@ def detect_nvfp4_moe_support(class_name: str = "") -> NvFp4Support: """Detect platform support for NV-FP4 fused-MoE path""" cutlass_supported = cutlass_fp4_supported() - allow_flashinfer = cutlass_supported and is_flashinfer_fp4_cutlass_moe_available() + allow_flashinfer = cutlass_supported and ( + is_flashinfer_fp4_cutlass_moe_available() + or 
is_flashinfer_fp4_cutedsl_moe_available() + ) if allow_flashinfer: _logger.info_once( diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 1209d64901bf5..9f9976d52b4ae 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -114,7 +114,17 @@ flashinfer_trtllm_fp8_per_tensor_scale_moe = _lazy_import_wrapper( flashinfer_cutlass_fused_moe = _lazy_import_wrapper( "flashinfer.fused_moe", "cutlass_fused_moe" ) +flashinfer_cutedsl_grouped_gemm_nt_masked = _lazy_import_wrapper( + "flashinfer.cute_dsl.blockscaled_gemm", "grouped_gemm_nt_masked" +) flashinfer_fp4_quantize = _lazy_import_wrapper("flashinfer", "fp4_quantize") +nvfp4_batched_quantize = _lazy_import_wrapper("flashinfer", "nvfp4_batched_quantize") +silu_and_mul_scaled_nvfp4_experts_quantize = _lazy_import_wrapper( + "flashinfer", "silu_and_mul_scaled_nvfp4_experts_quantize" +) +scaled_fp4_grouped_quantize = _lazy_import_wrapper( + "flashinfer", "scaled_fp4_grouped_quantize" +) nvfp4_block_scale_interleave = _lazy_import_wrapper( "flashinfer", "nvfp4_block_scale_interleave" ) @@ -166,6 +176,14 @@ def has_flashinfer_moe() -> bool: ) +@functools.cache +def has_flashinfer_cutedsl() -> bool: + """Return ``True`` if FlashInfer cutedsl module is available.""" + return ( + has_flashinfer() and importlib.util.find_spec("flashinfer.cute_dsl") is not None + ) + + @functools.cache def has_flashinfer_cutlass_fused_moe() -> bool: """Return `True` if FlashInfer CUTLASS fused MoE is available.""" @@ -187,6 +205,26 @@ def has_flashinfer_cutlass_fused_moe() -> bool: return True +@functools.cache +def has_flashinfer_cutedsl_grouped_gemm_nt_masked() -> bool: + """Return ``True`` if FlashInfer CUTLASS fused MoE is available.""" + if not has_flashinfer_cutedsl(): + return False + + # Check if all required functions are available + required_functions = [ + ("flashinfer.cute_dsl.blockscaled_gemm", "grouped_gemm_nt_masked"), + ("flashinfer", "scaled_fp4_grouped_quantize"), + ("flashinfer", 
"silu_and_scaled_nvfp4_experts_quantize"), + ] + + for module_name, attr_name in required_functions: + mod = _get_submodule(module_name) + if not mod or not hasattr(mod, attr_name): + return False + return True + + @functools.cache def has_nvidia_artifactory() -> bool: """Return `True` if NVIDIA's artifactory is accessible. @@ -472,7 +510,10 @@ __all__ = [ "has_flashinfer", "flashinfer_trtllm_fp8_block_scale_moe", "flashinfer_cutlass_fused_moe", + "flashinfer_cutedsl_grouped_gemm_nt_masked", "flashinfer_fp4_quantize", + "silu_and_mul_scaled_nvfp4_experts_quantize", + "scaled_fp4_grouped_quantize", "nvfp4_block_scale_interleave", "trtllm_fp4_block_scale_moe", "autotune", @@ -480,6 +521,7 @@ __all__ = [ "has_flashinfer_comm", "has_flashinfer_all2all", "has_flashinfer_cutlass_fused_moe", + "has_flashinfer_cutedsl_grouped_gemm_nt_masked", "has_nvidia_artifactory", "supports_trtllm_attention", "can_use_trtllm_attention", From 88f5b19f0bc681c016eaaa17502d3bb4e2b59b51 Mon Sep 17 00:00:00 2001 From: Yongye Zhu Date: Wed, 19 Nov 2025 16:30:04 -0500 Subject: [PATCH 036/249] [DeepSeek] Fix DeepSeek V3.2 Rope Embedding (#28968) Signed-off-by: Yongye Zhu --- vllm/model_executor/layers/mla.py | 6 +++++- vllm/model_executor/models/deepseek_v2.py | 14 ++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py index c4c44b83ae6bf..6ebfa47a9dc3f 100644 --- a/vllm/model_executor/layers/mla.py +++ b/vllm/model_executor/layers/mla.py @@ -24,6 +24,7 @@ class MLAModules: q_b_proj: torch.nn.Module | None q_proj: torch.nn.Module | None indexer: torch.nn.Module | None + indexer_rotary_emb: torch.nn.Module | None is_sparse: bool topk_indices_buffer: torch.Tensor | None @@ -80,6 +81,7 @@ class MultiHeadLatentAttentionWrapper(CustomOp): self.rotary_emb = mla_modules.rotary_emb self.o_proj = mla_modules.o_proj self.indexer = mla_modules.indexer + self.indexer_rope_emb = mla_modules.indexer_rotary_emb 
self.is_sparse = mla_modules.is_sparse if self.indexer is not None: @@ -153,7 +155,9 @@ class MultiHeadLatentAttentionWrapper(CustomOp): ) if self.indexer and self.is_sparse: - _topk_indices = self.indexer(hidden_states, q_c, positions, self.rotary_emb) + _topk_indices = self.indexer( + hidden_states, q_c, positions, self.indexer_rope_emb + ) attn_out = self.mla_attn( q, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 6675b2133f386..c0ff621d84085 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -837,8 +837,8 @@ class Indexer(nn.Module): ) q_pe, k_pe = rotary_emb(positions, q_pe, k_pe.unsqueeze(1)) - q = torch.cat([q_pe, q_nope], dim=-1) - k = torch.cat([k_pe.squeeze(1), k_nope], dim=-1) + q = torch.cat([q_pe.squeeze(0), q_nope], dim=-1) + k = torch.cat([k_pe.squeeze((0, 2)), k_nope], dim=-1) # we only quant q here since k quant is fused with cache insertion q = q.view(-1, self.head_dim) @@ -987,6 +987,14 @@ class DeepseekV2MLAAttention(nn.Module): self.is_v32 = hasattr(config, "index_topk") if self.is_v32: + self.indexer_rope_emb = get_rope( + qk_rope_head_dim, + rotary_dim=qk_rope_head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + is_neox_style=True, + ) self.indexer = Indexer( vllm_config, config, @@ -998,6 +1006,7 @@ class DeepseekV2MLAAttention(nn.Module): f"{prefix}.indexer", ) else: + self.indexer_rope_emb = None self.indexer = None mla_modules = MLAModules( @@ -1015,6 +1024,7 @@ class DeepseekV2MLAAttention(nn.Module): q_b_proj=self.q_b_proj if self.q_lora_rank is not None else None, q_proj=self.q_proj if self.q_lora_rank is None else None, indexer=self.indexer, + indexer_rotary_emb=self.indexer_rope_emb, is_sparse=self.is_v32, topk_indices_buffer=topk_indices_buffer, ) From 22e44ad589d951f440ef98141a2a6f9df97f6873 Mon Sep 17 00:00:00 2001 From: Micah Williamson Date: Wed, 19 Nov 2025 15:31:33 
-0600 Subject: [PATCH 037/249] [ROCm][CI] Fix Weight Loading With Multiple GPU Tests on ROCm (#28984) Signed-off-by: Micah Williamson --- .buildkite/test-amd.yaml | 5 ++--- tests/weight_loading/models-amd.txt | 3 +++ tests/weight_loading/models-large-amd.txt | 3 +++ 3 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 tests/weight_loading/models-amd.txt create mode 100644 tests/weight_loading/models-large-amd.txt diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 0049f35403409..37c6bd4276722 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1323,7 +1323,7 @@ steps: - vllm/ - tests/weight_loading commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt - label: Weight Loading Multiple GPU Test - Large Models # optional mirror_hardwares: [amdexperimental] @@ -1331,13 +1331,12 @@ steps: # grade: Blocking working_dir: "/vllm-workspace/tests" num_gpus: 2 - gpu: a100 optional: true source_file_dependencies: - vllm/ - tests/weight_loading commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt - label: NixlConnector PD accuracy tests (Distributed) # 30min mirror_hardwares: [amdexperimental] diff --git a/tests/weight_loading/models-amd.txt b/tests/weight_loading/models-amd.txt new file mode 100644 index 0000000000000..e31e904c08af4 --- /dev/null +++ b/tests/weight_loading/models-amd.txt @@ -0,0 +1,3 @@ +fp8, amd/Meta-Llama-3.1-8B-Instruct-FP8-KV, main +None, amd/Llama-3.2-1B-Instruct-FP8-KV, main +fp8, amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV, main diff --git a/tests/weight_loading/models-large-amd.txt b/tests/weight_loading/models-large-amd.txt new file mode 100644 index 0000000000000..b6f5b4b16b37f --- /dev/null +++ 
b/tests/weight_loading/models-large-amd.txt @@ -0,0 +1,3 @@ +fp8, amd/Meta-Llama-3.1-70B-Instruct-FP8-KV, main +None, microsoft/phi-4, main +fp8, amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV, main From 8f4f77a7275ecac594f84bdb41b67c95cf3eb26d Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 19 Nov 2025 16:43:54 -0500 Subject: [PATCH 038/249] [BugFix] Fix false assertion with spec-decode=[2,4,..] and TP>2 (#29036) Signed-off-by: Lucas Wilkinson --- vllm/config/compilation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index ca01cb3fb55d5..1c3ef502f0f45 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -921,7 +921,7 @@ class CompilationConfig: self, uniform_decode_query_len: int, tensor_parallel_size: int ): multiple_of = uniform_decode_query_len - if tensor_parallel_size > 1: + if tensor_parallel_size > 1 and self.pass_config.enable_sequence_parallelism: multiple_of = max(uniform_decode_query_len, tensor_parallel_size) if ( multiple_of % uniform_decode_query_len != 0 From cb0a7b4bea26657da989562a10055b7d0b59fd3a Mon Sep 17 00:00:00 2001 From: Max Hu Date: Wed, 19 Nov 2025 16:54:15 -0500 Subject: [PATCH 039/249] [Bugfix] Move flashinfer kernel check into ```__init__``` function of ```FusedMoE``` (#29018) Signed-off-by: Max Hu --- vllm/model_executor/layers/fused_moe/layer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 7b15e63e9e350..be1910266c878 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -574,6 +574,9 @@ class FusedMoE(CustomOp): is_act_and_mul=is_act_and_mul, is_lora_enabled=vllm_config.lora_config is not None, ) + self.moe_config_use_flashinfer_cutlass_kernels = ( + self.moe_config.use_flashinfer_cutlass_kernels + ) self.quant_config = quant_config @@ -728,7 +731,7 
@@ class FusedMoE(CustomOp): return ( self.moe_quant_config is not None and self.moe_quant_config.quant_dtype == "nvfp4" - and self.moe_config.use_flashinfer_cutlass_kernels + and self.moe_config_use_flashinfer_cutlass_kernels ) @property From 0075bfffd4201d1377f0d048848f82911e917639 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 19 Nov 2025 17:22:43 -0500 Subject: [PATCH 040/249] [CI] Fix precommit `rope_theta` issue (#29040) Signed-off-by: yewentao256 --- vllm/model_executor/models/deepseek_v2.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index c0ff621d84085..c50fc327e7608 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -991,8 +991,7 @@ class DeepseekV2MLAAttention(nn.Module): qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=config.rope_parameters, is_neox_style=True, ) self.indexer = Indexer( From 8e38e998298364b0a94cddf7ccc59d8466c2396a Mon Sep 17 00:00:00 2001 From: JartX Date: Thu, 20 Nov 2025 00:30:08 +0100 Subject: [PATCH 041/249] [Feature] EPLB on Qwen3VLMoe and CompressedTensorsWNA16MoEMethod (#28849) --- .../compressed_tensors_moe.py | 27 +++++++- vllm/model_executor/models/qwen3_vl_moe.py | 62 +++++++++++++++++-- 2 files changed, 82 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 22b3c477f420f..fa254030a271a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -1921,9 +1921,20 @@ class 
CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: if enable_eplb: - raise NotImplementedError( - "EPLB not supported for `CompressedTensorsWNA16MoEMethod` yet." - ) + if expert_load_view is None: + raise ValueError("enable_eplb=True requiere expert_load_view != None") + if logical_to_physical_map is None: + raise ValueError( + "enable_eplb=True requiere logical_to_physical_map != None" + ) + if logical_replica_count is None: + raise ValueError( + "enable_eplb=True requiere logical_replica_count != None" + ) + if not isinstance(layer, FusedMoE): + raise TypeError( + "EPLB is only supported when `layer` is a instance of FusedMoE." + ) from vllm.model_executor.layers.fused_moe import fused_experts @@ -1940,6 +1951,12 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, indices_type=self.topk_indices_dtype, + num_fused_shared_experts=getattr(layer, "num_fused_shared_experts", 0), + enable_eplb=enable_eplb, + expert_map=expert_map, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, ) return fused_experts( @@ -1956,6 +1973,10 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): quant_config=self.moe_quant_config, ) + @property + def supports_eplb(self) -> bool: + return True + class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod): """ diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py index 5c3205faf9c2f..e2c129120b1a5 100644 --- a/vllm/model_executor/models/qwen3_vl_moe.py +++ b/vllm/model_executor/models/qwen3_vl_moe.py @@ -15,7 +15,7 @@ # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -29,7 +29,9 @@ from collections.abc import Callable, Iterable from itertools import islice import torch -from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import Qwen3VLMoeConfig +from transformers.models.qwen3_vl_moe.configuration_qwen3_vl_moe import ( + Qwen3VLMoeConfig, +) from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig @@ -44,7 +46,12 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sequence import IntermediateTensors -from .qwen3_moe import Qwen3MoeForCausalLM, Qwen3MoeModel +from .interfaces import MixtureOfExperts +from .qwen3_moe import ( + Qwen3MoeForCausalLM, + Qwen3MoeModel, + Qwen3MoeSparseMoeBlock, +) from .qwen3_vl import ( Qwen3_VisionTransformer, Qwen3VLDummyInputsBuilder, @@ -344,12 +351,56 @@ class Qwen3MoeLLMForCausalLM(Qwen3MoeForCausalLM): ) +class Qwen3VLMoeMixtureOfExperts(MixtureOfExperts): + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = num_physical_experts - self.num_logical_experts + for layer in self.language_model.model.layers: + if isinstance(layer.mlp, Qwen3MoeSparseMoeBlock): + moe = layer.mlp + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + + def set_moe_parameters(self): + 
self.expert_weights = [] + + self.moe_layers = [] + example_moe = None + for layer in self.language_model.model.layers: + if hasattr(layer, "mlp") and isinstance(layer.mlp, Qwen3MoeSparseMoeBlock): + example_moe = layer.mlp + self.moe_layers.append(layer.mlp.experts) + + if example_moe is None: + raise RuntimeError("No Qwen3Moe layer found in the language_model.") + + # Set MoE hyperparameters + self.num_moe_layers = len(self.moe_layers) + self.num_expert_groups = 1 + self.num_shared_experts = 0 + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + @MULTIMODAL_REGISTRY.register_processor( Qwen3VLMultiModalProcessor, info=Qwen3VLMoeProcessingInfo, dummy_inputs=Qwen3VLDummyInputsBuilder, ) -class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration): +class Qwen3VLMoeForConditionalGeneration( + Qwen3VLForConditionalGeneration, Qwen3VLMoeMixtureOfExperts +): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -413,3 +464,6 @@ class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration): self.deepstack_input_embeds = None self.visual_dim = config.vision_config.out_hidden_size self.multiscale_dim = self.visual_dim * self.deepstack_num_level + + # Set MoE hyperparameters + self.set_moe_parameters() From 3aaa94ac99f4b295ba95f14b4968620b2127044f Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-redhat@users.noreply.github.com> Date: Wed, 19 Nov 2025 18:47:13 -0500 Subject: [PATCH 042/249] [Performance] Reduce DeepGEMM N dim restriction from 128 to 64 multiplier (#28687) Signed-off-by: Alexander Matveev Signed-off-by: mgoin Co-authored-by: mgoin --- .buildkite/test-pipeline.yaml | 20 ++++++++++++++++++++ tests/kernels/quantization/test_block_fp8.py | 11 +++++++---- 
vllm/utils/deep_gemm.py | 11 +++++++++-- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5309581d8e81f..71249a9543c7c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -550,6 +550,26 @@ steps: commands: - pytest -v -s kernels/mamba +- label: Kernels DeepGEMM Test (H100) + timeout_in_minutes: 45 + gpu: h100 + num_gpus: 1 + optional: true + source_file_dependencies: + - tools/install_deepgemm.sh + - vllm/utils/deep_gemm.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization + - tests/kernels/quantization/test_block_fp8.py + - tests/kernels/moe/test_deepgemm.py + - tests/kernels/moe/test_batched_deepgemm.py + - tests/kernels/attention/test_deepgemm_attention.py + commands: + - pytest -v -s tests/kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s tests/kernels/moe/test_deepgemm.py + - pytest -v -s tests/kernels/moe/test_batched_deepgemm.py + - pytest -v -s tests/kernels/attention/test_deepgemm_attention.py + - label: Model Executor Test # 23min timeout_in_minutes: 35 torch_nightly: true diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index e9973c1fcc15e..d0e4f6554a91f 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -22,6 +22,7 @@ from vllm.utils.deep_gemm import ( fp8_gemm_nt, get_col_major_tma_aligned_tensor, per_block_cast_to_fp8, + should_use_deepgemm_for_fp8_linear, ) from vllm.utils.import_utils import has_deep_gemm @@ -157,10 +158,6 @@ def test_w8a8_block_fp8_cutlass_matmul(): @pytest.mark.skipif(not has_deep_gemm(), reason="DeepGemm kernels not available.") @torch.inference_mode() def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed): - # only aligned sizes - if M % 4 != 0 or K % 128 != 0 or N % 64 != 0: - pytest.skip(f"Skipping test; invalid size {M}, 
{N}, {K}") - torch.manual_seed(seed) fp8_info = torch.finfo(torch.float8_e4m3fn) fp8_max = fp8_info.max @@ -168,6 +165,12 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed): A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max + # only aligned sizes are supported by deepgemm + if not should_use_deepgemm_for_fp8_linear( + output_dtype=out_dtype, weight=B_fp32, supports_deep_gemm=True + ): + pytest.skip(f"Skipping test; invalid size {M}, {N}, {K}") + A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_size[1]) B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32, block_size=block_size) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index b5ab37534dd78..6b0a383a0e28c 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -365,11 +365,18 @@ def should_use_deepgemm_for_fp8_linear( ): if supports_deep_gemm is None: supports_deep_gemm = is_deep_gemm_supported() + + # Verify DeepGEMM N/K dims requirements + # NOTE: Also synchronized with test_w8a8_block_fp8_deep_gemm_matmul + # test inside kernels/quatization/test_block_fp8.py + N_MULTIPLE = 64 + K_MULTIPLE = 128 + return ( supports_deep_gemm and output_dtype == torch.bfloat16 - and weight.shape[0] % 128 == 0 - and weight.shape[1] % 128 == 0 + and weight.shape[0] % N_MULTIPLE == 0 + and weight.shape[1] % K_MULTIPLE == 0 ) From 5031cd5d55ad99e8f9b31dd0020a06b346f6e493 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 19 Nov 2025 18:53:15 -0500 Subject: [PATCH 043/249] [Refactor] Optimize `select_experts` (#28069) Signed-off-by: yewentao256 --- vllm/model_executor/layers/fused_moe/fused_moe.py | 5 ----- vllm/model_executor/layers/fused_moe/layer.py | 11 ++++------- vllm/model_executor/layers/quantization/modelopt.py | 2 +- vllm/model_executor/models/longcat_flash.py | 2 +- vllm/model_executor/models/openpangu.py | 2 +- 5 files changed, 
7 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 2e042d85fcfcf..f44328418f1bc 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1246,7 +1246,6 @@ def eplb_map_to_physical_and_record( expert_load_view: torch.Tensor, logical_to_physical_map: torch.Tensor, logical_replica_count: torch.Tensor, - indices_type: torch.dtype | None = None, ) -> torch.Tensor: """ Map the logical expert ids to physical expert ids @@ -1260,7 +1259,6 @@ def eplb_map_to_physical_and_record( expert_load_view: The expert load view. logical_to_physical_map: The logical to physical map. logical_replica_count: The logical replica count. - indices_type: The indices type. Returns: The physical expert ids. @@ -1310,9 +1308,6 @@ def eplb_map_to_physical_and_record( index=topk_ids_flatten.long(), src=torch.ones_like(topk_ids_flatten).to(expert_load_view), ) - - if indices_type is not None: - topk_ids = topk_ids.to(dtype=indices_type) return topk_ids diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index be1910266c878..d9525a7439c3e 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -68,7 +68,6 @@ else: expert_load_view: torch.Tensor, logical_to_physical_map: torch.Tensor, logical_replica_count: torch.Tensor, - indices_type: torch.dtype | None, ) -> torch.Tensor: # CPU fallback: no EPLB so just return as is return topk_ids @@ -1509,8 +1508,6 @@ class FusedMoE(CustomOp): routed_scaling_factor=routed_scaling_factor, e_score_correction_bias=e_score_correction_bias, ) - if indices_type is not None: - topk_ids = topk_ids.to(dtype=indices_type) elif e_score_correction_bias is not None: topk_weights, topk_ids = fused_topk_bias( hidden_states=hidden_states, @@ -1519,7 +1516,7 @@ class FusedMoE(CustomOp): topk=top_k, 
renormalize=renormalize, ) - if routed_scaling_factor is not None: + if routed_scaling_factor != 1.0: topk_weights *= routed_scaling_factor elif custom_routing_function is None: topk_weights, topk_ids, token_expert_indices = fused_topk( @@ -1536,8 +1533,6 @@ class FusedMoE(CustomOp): topk=top_k, renormalize=renormalize, ) - if indices_type is not None: - topk_ids = topk_ids.to(dtype=indices_type) if enable_eplb: assert expert_load_view is not None @@ -1549,9 +1544,11 @@ class FusedMoE(CustomOp): expert_load_view=expert_load_view, logical_to_physical_map=logical_to_physical_map, logical_replica_count=logical_replica_count, - indices_type=indices_type, ) + if (indices_type is not None) and topk_ids.dtype != indices_type: + topk_ids = topk_ids.to(dtype=indices_type) + assert topk_ids.dtype == indices_type or indices_type is None # Compute zero expert result if needed diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index f684c17452a9b..dedab33c1bdb7 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1706,7 +1706,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): intermediate_size=layer.intermediate_size_per_partition, local_expert_offset=layer.ep_rank * layer.local_num_experts, local_num_experts=layer.local_num_experts, - routed_scaling_factor=None, + routed_scaling_factor=1.0, tile_tokens_dim=None, routing_method_type=routing_method_type, do_finalize=True, diff --git a/vllm/model_executor/models/longcat_flash.py b/vllm/model_executor/models/longcat_flash.py index fafe97cd2be7e..c5441283f9711 100644 --- a/vllm/model_executor/models/longcat_flash.py +++ b/vllm/model_executor/models/longcat_flash.py @@ -118,7 +118,7 @@ class FlashConfig(PretrainedConfig): router_dtype="float32", router_bias=False, topk_method=None, - routed_scaling_factor=None, + routed_scaling_factor=1.0, zero_expert_num=0, zero_expert_type=None, 
nextn_use_scmoe=False, diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index f814cdfec5a22..4124a181a14c2 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -625,7 +625,7 @@ class OpenPanguDecoderLayer(nn.Module): bias=getattr(config, "mlp_bias", False), prefix=f"{prefix}.mlp", ) - self.routed_scaling_factor = getattr(config, "routed_scaling_factor", None) + self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0) self.num_hidden_layers = config.num_hidden_layers self.first_k_dense_replace = getattr( config, "first_k_dense_replace", self.num_hidden_layers From 537cc635c77ac63f643c5289137debdd8f9591ac Mon Sep 17 00:00:00 2001 From: Jialin Ouyang Date: Wed, 19 Nov 2025 16:10:22 -0800 Subject: [PATCH 044/249] [GC Debugger] Simply and improve GC Debugger Utils (#29029) Signed-off-by: Jialin Ouyang --- vllm/utils/gc_utils.py | 7 ++++--- vllm/v1/engine/core.py | 5 ++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/utils/gc_utils.py b/vllm/utils/gc_utils.py index 160ac9ac263a9..3436e450a269f 100644 --- a/vllm/utils/gc_utils.py +++ b/vllm/utils/gc_utils.py @@ -68,9 +68,10 @@ class GCDebugger: # Before GC started, record GC start time # and top collected objects self.start_time_ns = time.monotonic_ns() - self.gc_top_collected_objects = _compute_top_gc_collected_objects( - gc.get_objects(generation), self.config.top_objects - ) + if (top_objects := self.config.top_objects) > 0: + self.gc_top_collected_objects = _compute_top_gc_collected_objects( + gc.get_objects(generation), top_objects + ) elif phase == "stop": # After GC finished, Record GC elapsed time and # optionally top collected objects diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 6be19894d332a..8657a95b5e6e7 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -206,6 +206,8 @@ class EngineCore: # Mark the startup heap as static so that it's 
ignored by GC. # Reduces pause times of oldest generation collections. freeze_gc_heap() + # If enable, attach GC debugger after static variable freeze. + maybe_attach_gc_debug_callback() def _initialize_kv_caches( self, vllm_config: VllmConfig @@ -645,9 +647,6 @@ class EngineCoreProc(EngineCore): assert addresses.coordinator_input is not None logger.info("Waiting for READY message from DP Coordinator...") - # If enable, attach GC debugger after static variable freeze. - maybe_attach_gc_debug_callback() - # Enable environment variable cache (e.g. assume no more # environment variable overrides after this point) enable_envs_cache() From 9ccef8e333ccd988a587990740405503e76c8c20 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 19 Nov 2025 16:26:04 -0800 Subject: [PATCH 045/249] [Misc] Colorize logs (#29017) Signed-off-by: Nick Hill --- tests/test_logger.py | 94 ++++++++++++++++++--------------- vllm/envs.py | 9 ++++ vllm/logger.py | 51 ++++++++++++------ vllm/logging_utils/__init__.py | 3 +- vllm/logging_utils/formatter.py | 50 ++++++++++++++++++ vllm/utils/system_utils.py | 7 ++- 6 files changed, 152 insertions(+), 62 deletions(-) diff --git a/tests/test_logger.py b/tests/test_logger.py index 01672358902f9..8900e9c2a1e69 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -49,10 +49,13 @@ def test_trace_function_call(): os.remove(path) -def test_default_vllm_root_logger_configuration(): +def test_default_vllm_root_logger_configuration(monkeypatch): """This test presumes that VLLM_CONFIGURE_LOGGING (default: True) and VLLM_LOGGING_CONFIG_PATH (default: None) are not configured and default behavior is activated.""" + monkeypatch.setenv("VLLM_LOGGING_COLOR", "0") + _configure_vllm_root_logger() + logger = logging.getLogger("vllm") assert logger.level == logging.DEBUG assert not logger.propagate @@ -70,12 +73,13 @@ def test_default_vllm_root_logger_configuration(): assert formatter.datefmt == _DATE_FORMAT -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1) 
-@patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", None) -def test_descendent_loggers_depend_on_and_propagate_logs_to_root_logger(): +def test_descendent_loggers_depend_on_and_propagate_logs_to_root_logger(monkeypatch): """This test presumes that VLLM_CONFIGURE_LOGGING (default: True) and VLLM_LOGGING_CONFIG_PATH (default: None) are not configured and default behavior is activated.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "1") + monkeypatch.delenv("VLLM_LOGGING_CONFIG_PATH", raising=False) + root_logger = logging.getLogger("vllm") root_handler = root_logger.handlers[0] @@ -99,49 +103,50 @@ def test_descendent_loggers_depend_on_and_propagate_logs_to_root_logger(): assert log_record.levelno == logging.INFO -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 0) -@patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", None) -def test_logger_configuring_can_be_disabled(): +def test_logger_configuring_can_be_disabled(monkeypatch): """This test calls _configure_vllm_root_logger again to test custom logging config behavior, however mocks are used to ensure no changes in behavior or configuration occur.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "0") + monkeypatch.delenv("VLLM_LOGGING_CONFIG_PATH", raising=False) with patch("vllm.logger.dictConfig") as dict_config_mock: _configure_vllm_root_logger() dict_config_mock.assert_not_called() -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1) -@patch( - "vllm.logger.VLLM_LOGGING_CONFIG_PATH", - "/if/there/is/a/file/here/then/you/did/this/to/yourself.json", -) -def test_an_error_is_raised_when_custom_logging_config_file_does_not_exist(): +def test_an_error_is_raised_when_custom_logging_config_file_does_not_exist(monkeypatch): """This test calls _configure_vllm_root_logger again to test custom logging config behavior, however it fails before any change in behavior or configuration occurs.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "1") + monkeypatch.setenv( + "VLLM_LOGGING_CONFIG_PATH", + 
"/if/there/is/a/file/here/then/you/did/this/to/yourself.json", + ) + with pytest.raises(RuntimeError) as ex_info: _configure_vllm_root_logger() assert ex_info.type == RuntimeError # noqa: E721 assert "File does not exist" in str(ex_info) -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1) -def test_an_error_is_raised_when_custom_logging_config_is_invalid_json(): +def test_an_error_is_raised_when_custom_logging_config_is_invalid_json(monkeypatch): """This test calls _configure_vllm_root_logger again to test custom logging config behavior, however it fails before any change in behavior or configuration occurs.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "1") + with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file: logging_config_file.write("---\nloggers: []\nversion: 1") logging_config_file.flush() - with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name): - with pytest.raises(JSONDecodeError) as ex_info: - _configure_vllm_root_logger() - assert ex_info.type == JSONDecodeError - assert "Expecting value" in str(ex_info) + monkeypatch.setenv("VLLM_LOGGING_CONFIG_PATH", logging_config_file.name) + with pytest.raises(JSONDecodeError) as ex_info: + _configure_vllm_root_logger() + assert ex_info.type == JSONDecodeError + assert "Expecting value" in str(ex_info) -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1) @pytest.mark.parametrize( "unexpected_config", ( @@ -151,26 +156,30 @@ def test_an_error_is_raised_when_custom_logging_config_is_invalid_json(): ), ) def test_an_error_is_raised_when_custom_logging_config_is_unexpected_json( + monkeypatch, unexpected_config: Any, ): """This test calls _configure_vllm_root_logger again to test custom logging config behavior, however it fails before any change in behavior or configuration occurs.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "1") + with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file: logging_config_file.write(json.dumps(unexpected_config)) 
logging_config_file.flush() - with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name): - with pytest.raises(ValueError) as ex_info: - _configure_vllm_root_logger() - assert ex_info.type == ValueError # noqa: E721 - assert "Invalid logging config. Expected dict, got" in str(ex_info) + monkeypatch.setenv("VLLM_LOGGING_CONFIG_PATH", logging_config_file.name) + with pytest.raises(ValueError) as ex_info: + _configure_vllm_root_logger() + assert ex_info.type == ValueError # noqa: E721 + assert "Invalid logging config. Expected dict, got" in str(ex_info) -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 1) -def test_custom_logging_config_is_parsed_and_used_when_provided(): +def test_custom_logging_config_is_parsed_and_used_when_provided(monkeypatch): """This test calls _configure_vllm_root_logger again to test custom logging config behavior, however mocks are used to ensure no changes in behavior or configuration occur.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "1") + valid_logging_config = { "loggers": { "vllm.test_logger.logger": { @@ -183,19 +192,18 @@ def test_custom_logging_config_is_parsed_and_used_when_provided(): with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file: logging_config_file.write(json.dumps(valid_logging_config)) logging_config_file.flush() - with ( - patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name), - patch("vllm.logger.dictConfig") as dict_config_mock, - ): + monkeypatch.setenv("VLLM_LOGGING_CONFIG_PATH", logging_config_file.name) + with patch("vllm.logger.dictConfig") as dict_config_mock: _configure_vllm_root_logger() dict_config_mock.assert_called_with(valid_logging_config) -@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 0) -def test_custom_logging_config_causes_an_error_if_configure_logging_is_off(): +def test_custom_logging_config_causes_an_error_if_configure_logging_is_off(monkeypatch): """This test calls _configure_vllm_root_logger again to test custom logging config behavior, 
however mocks are used to ensure no changes in behavior or configuration occur.""" + monkeypatch.setenv("VLLM_CONFIGURE_LOGGING", "0") + valid_logging_config = { "loggers": { "vllm.test_logger.logger": { @@ -207,15 +215,15 @@ def test_custom_logging_config_causes_an_error_if_configure_logging_is_off(): with NamedTemporaryFile(encoding="utf-8", mode="w") as logging_config_file: logging_config_file.write(json.dumps(valid_logging_config)) logging_config_file.flush() - with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name): - with pytest.raises(RuntimeError) as ex_info: - _configure_vllm_root_logger() - assert ex_info.type is RuntimeError - expected_message_snippet = ( - "VLLM_CONFIGURE_LOGGING evaluated to false, but " - "VLLM_LOGGING_CONFIG_PATH was given." - ) - assert expected_message_snippet in str(ex_info) + monkeypatch.setenv("VLLM_LOGGING_CONFIG_PATH", logging_config_file.name) + with pytest.raises(RuntimeError) as ex_info: + _configure_vllm_root_logger() + assert ex_info.type is RuntimeError + expected_message_snippet = ( + "VLLM_CONFIGURE_LOGGING evaluated to false, but " + "VLLM_LOGGING_CONFIG_PATH was given." + ) + assert expected_message_snippet in str(ex_info) # Remember! The root logger is assumed to have been configured as # though VLLM_CONFIGURE_LOGGING=1 and VLLM_LOGGING_CONFIG_PATH=None. 
diff --git a/vllm/envs.py b/vllm/envs.py index 1ff620af57229..614bc94b978bd 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -42,6 +42,8 @@ if TYPE_CHECKING: VLLM_LOGGING_PREFIX: str = "" VLLM_LOGGING_STREAM: str = "ext://sys.stdout" VLLM_LOGGING_CONFIG_PATH: str | None = None + VLLM_LOGGING_COLOR: str = "auto" + NO_COLOR: bool = False VLLM_LOG_STATS_INTERVAL: float = 10.0 VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: str | None = None @@ -616,6 +618,11 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_LOGGING_STREAM": lambda: os.getenv("VLLM_LOGGING_STREAM", "ext://sys.stdout"), # if set, VLLM_LOGGING_PREFIX will be prepended to all log messages "VLLM_LOGGING_PREFIX": lambda: os.getenv("VLLM_LOGGING_PREFIX", ""), + # Controls colored logging output. Options: "auto" (default, colors when terminal), + # "1" (always use colors), "0" (never use colors) + "VLLM_LOGGING_COLOR": lambda: os.getenv("VLLM_LOGGING_COLOR", "auto"), + # Standard unix flag for disabling ANSI color codes + "NO_COLOR": lambda: os.getenv("NO_COLOR", "0") != "0", # If set, vllm will log stats at this interval in seconds # If not set, vllm will log stats every 10 seconds. 
"VLLM_LOG_STATS_INTERVAL": lambda: val @@ -1578,6 +1585,7 @@ def compile_factors() -> dict[str, object]: "VLLM_LOGGING_PREFIX", "VLLM_LOGGING_STREAM", "VLLM_LOGGING_CONFIG_PATH", + "VLLM_LOGGING_COLOR", "VLLM_LOG_STATS_INTERVAL", "VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "VLLM_TUNED_CONFIG_FOLDER", @@ -1608,6 +1616,7 @@ def compile_factors() -> dict[str, object]: "VLLM_TEST_FORCE_LOAD_FORMAT", "LOCAL_RANK", "CUDA_VISIBLE_DEVICES", + "NO_COLOR", } from vllm.config.utils import normalize_value diff --git a/vllm/logger.py b/vllm/logger.py index 9341008296843..772e36497b45e 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -17,18 +17,25 @@ from typing import Any, Literal, cast import vllm.envs as envs -VLLM_CONFIGURE_LOGGING = envs.VLLM_CONFIGURE_LOGGING -VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH -VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL -VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX -VLLM_LOGGING_STREAM = envs.VLLM_LOGGING_STREAM - _FORMAT = ( - f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s " + f"{envs.VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s " "[%(fileinfo)s:%(lineno)d] %(message)s" ) _DATE_FORMAT = "%m-%d %H:%M:%S" + +def _use_color() -> bool: + if envs.NO_COLOR or envs.VLLM_LOGGING_COLOR == "0": + return False + if envs.VLLM_LOGGING_COLOR == "1": + return True + if envs.VLLM_LOGGING_STREAM == "ext://sys.stdout": # stdout + return hasattr(sys.stdout, "isatty") and sys.stdout.isatty() + elif envs.VLLM_LOGGING_STREAM == "ext://sys.stderr": # stderr + return hasattr(sys.stderr, "isatty") and sys.stderr.isatty() + return False + + DEFAULT_LOGGING_CONFIG = { "formatters": { "vllm": { @@ -36,13 +43,19 @@ DEFAULT_LOGGING_CONFIG = { "datefmt": _DATE_FORMAT, "format": _FORMAT, }, + "vllm_color": { + "class": "vllm.logging_utils.ColoredFormatter", + "datefmt": _DATE_FORMAT, + "format": _FORMAT, + }, }, "handlers": { "vllm": { "class": "logging.StreamHandler", - "formatter": "vllm", - "level": VLLM_LOGGING_LEVEL, - "stream": VLLM_LOGGING_STREAM, + # 
Choose formatter based on color setting. + "formatter": "vllm_color" if _use_color() else "vllm", + "level": envs.VLLM_LOGGING_LEVEL, + "stream": envs.VLLM_LOGGING_STREAM, }, }, "loggers": { @@ -144,7 +157,7 @@ _METHODS_TO_PATCH = { def _configure_vllm_root_logger() -> None: logging_config = dict[str, Any]() - if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH: + if not envs.VLLM_CONFIGURE_LOGGING and envs.VLLM_LOGGING_CONFIG_PATH: raise RuntimeError( "VLLM_CONFIGURE_LOGGING evaluated to false, but " "VLLM_LOGGING_CONFIG_PATH was given. VLLM_LOGGING_CONFIG_PATH " @@ -152,16 +165,22 @@ def _configure_vllm_root_logger() -> None: "VLLM_CONFIGURE_LOGGING or unset VLLM_LOGGING_CONFIG_PATH." ) - if VLLM_CONFIGURE_LOGGING: + if envs.VLLM_CONFIGURE_LOGGING: logging_config = DEFAULT_LOGGING_CONFIG - if VLLM_LOGGING_CONFIG_PATH: - if not path.exists(VLLM_LOGGING_CONFIG_PATH): + vllm_handler = logging_config["handlers"]["vllm"] + # Refresh these values in case env vars have changed. + vllm_handler["level"] = envs.VLLM_LOGGING_LEVEL + vllm_handler["stream"] = envs.VLLM_LOGGING_STREAM + vllm_handler["formatter"] = "vllm_color" if _use_color() else "vllm" + + if envs.VLLM_LOGGING_CONFIG_PATH: + if not path.exists(envs.VLLM_LOGGING_CONFIG_PATH): raise RuntimeError( "Could not load logging config. 
File does not exist: %s", - VLLM_LOGGING_CONFIG_PATH, + envs.VLLM_LOGGING_CONFIG_PATH, ) - with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file: + with open(envs.VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file: custom_config = json.loads(file.read()) if not isinstance(custom_config, dict): diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py index 44b40ead973ba..8d3354df215b1 100644 --- a/vllm/logging_utils/__init__.py +++ b/vllm/logging_utils/__init__.py @@ -1,12 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from vllm.logging_utils.formatter import NewLineFormatter +from vllm.logging_utils.formatter import ColoredFormatter, NewLineFormatter from vllm.logging_utils.lazy import lazy from vllm.logging_utils.log_time import logtime __all__ = [ "NewLineFormatter", + "ColoredFormatter", "lazy", "logtime", ] diff --git a/vllm/logging_utils/formatter.py b/vllm/logging_utils/formatter.py index 02ba308e18796..3ad4ef8d119ad 100644 --- a/vllm/logging_utils/formatter.py +++ b/vllm/logging_utils/formatter.py @@ -75,3 +75,53 @@ class NewLineFormatter(logging.Formatter): parts = msg.split(record.message) msg = msg.replace("\n", "\r\n" + parts[0]) return msg + + +class ColoredFormatter(NewLineFormatter): + """Adds ANSI color codes to log levels for terminal output. + + This formatter adds colors by injecting them into the format string for + static elements (timestamp, filename, line number) and modifying the + levelname attribute for dynamic color selection. 
+ """ + + # ANSI color codes + COLORS = { + "DEBUG": "\033[37m", # White + "INFO": "\033[32m", # Green + "WARNING": "\033[33m", # Yellow + "ERROR": "\033[31m", # Red + "CRITICAL": "\033[35m", # Magenta + } + GREY = "\033[90m" # Grey for timestamp and file info + RESET = "\033[0m" + + def __init__(self, fmt, datefmt=None, style="%"): + # Inject grey color codes into format string for timestamp and file info + if fmt: + # Wrap %(asctime)s with grey + fmt = fmt.replace("%(asctime)s", f"{self.GREY}%(asctime)s{self.RESET}") + # Wrap [%(fileinfo)s:%(lineno)d] with grey + fmt = fmt.replace( + "[%(fileinfo)s:%(lineno)d]", + f"{self.GREY}[%(fileinfo)s:%(lineno)d]{self.RESET}", + ) + + # Call parent __init__ with potentially modified format string + super().__init__(fmt, datefmt, style) + + def format(self, record): + # Store original levelname to restore later (in case record is reused) + orig_levelname = record.levelname + + # Only modify levelname - it needs dynamic color based on severity + if (color_code := self.COLORS.get(record.levelname)) is not None: + record.levelname = f"{color_code}{record.levelname}{self.RESET}" + + # Call parent format which will handle everything else + msg = super().format(record) + + # Restore original levelname + record.levelname = orig_levelname + + return msg diff --git a/vllm/utils/system_utils.py b/vllm/utils/system_utils.py index 5968884e232a4..cc872040b6c5f 100644 --- a/vllm/utils/system_utils.py +++ b/vllm/utils/system_utils.py @@ -22,7 +22,7 @@ from .platform_utils import cuda_is_initialized, xpu_is_initialized logger = init_logger(__name__) -CYAN = "\033[1;36m" +CYAN = "\033[0;36m" RESET = "\033[0;0m" @@ -142,7 +142,10 @@ def set_process_title( def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None: """Add colored prefix to file output for log decoration.""" - prefix = f"{CYAN}({worker_name} pid={pid}){RESET} " + if envs.NO_COLOR: + prefix = f"({worker_name} pid={pid}) " + else: + prefix = f"{CYAN}({worker_name} 
pid={pid}){RESET} " file_write = file.write def write_with_prefix(s: str): From 1d642872a27f1c6bedf28669642928cc7eec6532 Mon Sep 17 00:00:00 2001 From: liangel-02 Date: Wed, 19 Nov 2025 19:39:45 -0500 Subject: [PATCH 046/249] [torchao] fix safetensors for sharding (#28169) Signed-off-by: Angel Li --- tests/quantization/test_torchao.py | 9 ++++---- .../model_loader/default_loader.py | 2 +- .../model_loader/weight_utils.py | 23 +++++++++++++++---- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py index fb8d6130c3779..f35c3973ab6e6 100644 --- a/tests/quantization/test_torchao.py +++ b/tests/quantization/test_torchao.py @@ -225,13 +225,12 @@ def test_reload_weights(): @pytest.mark.skip( reason="since torchao nightly is only compatible with torch nightly" "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip " - "torchao tests that requires newer versions (0.14.0.dev+) for now" + "torchao tests that requires newer versions (0.15.0.dev+) for now" ) -def test_opt_125m_float8_weight_only_safetensors_model_loading_with_params(vllm_runner): +def test_safetensors_model_loading_with_params(vllm_runner): torch._dynamo.reset() - model_name = ( - "torchao-testing/opt-125m-Float8WeightOnlyConfig-v2-0.14.0.dev-safetensors" - ) + # using this model to test safetensors loading with file sharding + model_name = "torchao-testing/Qwen3-8B-INT4-0.15.0dev-safetensors" with vllm_runner(model_name=model_name, dtype="bfloat16") as llm: output = llm.generate_greedy(["The capital of France is"], max_tokens=4) diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index b80026741781f..67aa584c6bda2 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -279,7 +279,7 @@ class DefaultModelLoader(BaseModelLoader): if ( hasattr(quant_config, 
"is_checkpoint_torchao_serialized") and quant_config.is_checkpoint_torchao_serialized - and torchao_version_at_least("0.14.0") + and torchao_version_at_least("0.15.0") ): self.load_config.safetensors_load_strategy = "torchao" diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 89634cbf41241..4572ebe2ea11b 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -595,6 +595,9 @@ def safetensors_weights_iterator( if safetensors_load_strategy == "eager": loading_desc += " (eager)" + state_dict = {} + leftover_state_dict: dict[str, torch.Tensor] = {} + for st_file in tqdm( hf_weights_files, desc=loading_desc, @@ -606,9 +609,11 @@ def safetensors_weights_iterator( state_dict = load(f.read()) yield from state_dict.items() elif safetensors_load_strategy == "torchao": - if not torchao_version_at_least("0.14.0"): + # we can't load flattened torchao tensor subclasses directly into the model + # instead we reconstruct the subclasses here before returning + if not torchao_version_at_least("0.15.0"): raise ValueError( - "Please use torchao version >= 0.14.0 \ + "Please use torchao version >= 0.15.0 \ to load torchao safetensors checkpoint" ) from torchao.prototype.safetensors.safetensors_support import ( @@ -616,12 +621,20 @@ def safetensors_weights_iterator( ) with safe_open(st_file, framework="pt") as f: - state_dict = {} for name in f.keys(): # noqa: SIM118 state_dict[name] = f.get_tensor(name) + + # update with leftover tensor data from previous iteration, if any + state_dict.update(leftover_state_dict) metadata = f.metadata() - updated_state_dict = unflatten_tensor_state_dict(state_dict, metadata) - yield from updated_state_dict.items() + # due to sharded checkpoints, we are not guaranteed that we have all + # tensor subclass data on one file + # state_dict has the leftover data from this step and we wait for + # missing information to be 
provided in a future iteration + unflattened_state_dict, leftover_state_dict = ( + unflatten_tensor_state_dict(state_dict, metadata) + ) + yield from unflattened_state_dict.items() else: with safe_open(st_file, framework="pt") as f: for name in f.keys(): # noqa: SIM118 From 05c2dee7e9f485f1e76eee084849e07c1c12a68b Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 20 Nov 2025 09:40:49 +0800 Subject: [PATCH 047/249] [DeepSeek + LMCache Multiprocess] handle MLA for deepseek model + LMCache Multiprocess connector (#29039) Signed-off-by: KuntaiDu --- .../kv_connector/v1/lmcache_mp_connector.py | 47 +++++++++++++++---- 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py index 55831dc56c803..22ddabbf1e352 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Literal, Optional, cast import torch import zmq +from lmcache.integration.vllm.utils import mla_enabled from lmcache.utils import init_logger as lmcache_init_logger from vllm.config import VllmConfig @@ -60,17 +61,44 @@ def reformat_block_ids(block_ids: tuple[list[int], ...] | None) -> list[int]: return block_ids[0] +def extract_world_size_and_kv_rank( + world_size: int, + rank: int, + vllm_config: VllmConfig, +) -> tuple[int, int]: + """ + Return the (world_size, rank) pair for the KV cache, dividing out TP for MLA models. + """ + use_mla = mla_enabled(vllm_config.model_config) + if not use_mla: + return world_size, rank + else: + # Tensor parallel does not change the KV caches for MLA models. + # So we need to "exclude" the effect of TP on rank and world size + tp_size = vllm_config.parallel_config.tensor_parallel_size + # vLLM constructs TP groups first, and then constructs other + # parallel groups on top of TP groups.
+ # for example, TP=4, PP=2, + # TP group: [0, 1, 2, 3], [4, 5, 6, 7] + # PP group: [0, 4], [1, 5], [2, 6], [3, 7] + # So we can "exclude" the effect of TP by rank // tp_size. + return world_size // tp_size, rank // tp_size + + def create_scheduler_adapter( server_url: str, zmq_context: zmq.Context, vllm_config: VllmConfig ) -> LMCacheMPSchedulerAdapter: - # TODO: have a helper function to calculate the correct rank and - # world size for the MLA and other models + world_size, kv_rank = extract_world_size_and_kv_rank( + vllm_config.parallel_config.world_size, + vllm_config.parallel_config.rank, + vllm_config, + ) return LMCacheMPSchedulerAdapter( server_url, zmq_context, vllm_config.model_config.model, - vllm_config.parallel_config.world_size, - vllm_config.parallel_config.rank, + world_size, + kv_rank, vllm_config.cache_config.block_size, ) @@ -78,14 +106,17 @@ def create_scheduler_adapter( def create_worker_adapter( server_url: str, zmq_context: zmq.Context, vllm_config: VllmConfig ) -> LMCacheMPWorkerAdapter: - # TODO: have a helper function to calculate the correct rank and - # world size for the MLA and other models + world_size, kv_rank = extract_world_size_and_kv_rank( + vllm_config.parallel_config.world_size, + vllm_config.parallel_config.rank, + vllm_config, + ) return LMCacheMPWorkerAdapter( server_url, zmq_context, vllm_config.model_config.model, - vllm_config.parallel_config.world_size, - vllm_config.parallel_config.rank, + world_size, + kv_rank, vllm_config.cache_config.block_size, ) From 3fb0d90999887949629d1e9bac4d98336a35c475 Mon Sep 17 00:00:00 2001 From: Qiang Zhang Date: Thu, 20 Nov 2025 10:11:52 +0800 Subject: [PATCH 048/249] [AMD] Use Decoupled Kernel Block Size to Support AITER MLA block_size=1 (#27715) Signed-off-by: chiangzhang --- vllm/attention/backends/abstract.py | 14 +++--- .../attention/backends/mla/rocm_aiter_mla.py | 45 +++---------------- 2 files changed, 13 insertions(+), 46 deletions(-) diff --git 
a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index d28bc065852db..188becb6ad6f0 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -119,14 +119,12 @@ class AttentionBackend(ABC): return True for supported_size in cls.supported_kernel_block_sizes: - is_multiple_of = ( - isinstance(supported_size, MultipleOf) - and block_size % supported_size.base == 0 - ) - is_int_equal = ( - isinstance(supported_size, int) and block_size == supported_size - ) - if is_multiple_of or is_int_equal: + if isinstance(supported_size, MultipleOf): + supported_size = supported_size.base + # With hybrid_blocks feature, the framework-level block size + # only needs to be a multiple of the kernel's requirement, + # even if the kernel requires a fixed block_size. + if block_size % supported_size == 0: return True return False diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index e1864526f02cc..6ccc1a341d56c 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -7,9 +7,8 @@ from typing import ClassVar import torch from vllm._aiter_ops import rocm_aiter_ops -from vllm.attention.backends.abstract import AttentionLayer +from vllm.attention.backends.abstract import AttentionLayer, MultipleOf from vllm.config import VllmConfig -from vllm.utils.math_utils import cdiv from vllm.v1.attention.backends.mla.common import ( MLACommonBackend, MLACommonDecodeMetadata, @@ -22,6 +21,8 @@ from vllm.v1.kv_cache_interface import AttentionSpec class AiterMLABackend(MLACommonBackend): + supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [1] + @staticmethod def get_name() -> str: return "ROCM_AITER_MLA" @@ -71,9 +72,8 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): ) self.compilation_config = vllm_config.compilation_config - max_num_pages_per_req = cdiv( - 
vllm_config.model_config.max_model_len, self.kv_cache_spec.block_size - ) + # kernel block size is always 1. + max_num_pages_per_req = vllm_config.model_config.max_model_len max_num_reqs = vllm_config.scheduler_config.max_num_seqs max_num_pages = max_num_reqs * max_num_pages_per_req @@ -82,11 +82,6 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): # so we can only use the persistent buffer if a cudagraph is actually # being used. if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): - self.block_table_remapping = torch.zeros( - [max_num_reqs, max_num_pages_per_req * self.kv_cache_spec.block_size], - dtype=torch.int32, - device=device, - ) self.paged_kv_indptr = torch.zeros( max_num_reqs + 1, dtype=torch.int32, device=device ) @@ -111,36 +106,16 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): num_decode_tokens: int, dcp_tot_seq_lens_device: torch.Tensor | None, ) -> AiterMLADecodeMetadata: - page_size = self.kv_cache_spec.block_size + # kernel block size is always 1, although the kv block size is not 1. 
device = self.device num_reqs = seq_lens_device.size(0) - bs, _ = block_table_tensor.shape - block_table_tensor = ( - block_table_tensor.unsqueeze(-1).expand(-1, -1, page_size) * page_size - ) - block_table_tensor = ( - block_table_tensor - + torch.arange( - 0, - page_size, - device=block_table_tensor.device, - dtype=block_table_tensor.dtype, - )[None, None, :] - ) - block_table_tensor = block_table_tensor.view(bs, -1) - # after remapping, we assume the block size already equals to 1 - - max_blk_size_per_req = block_table_tensor.shape[-1] mask = torch.arange( block_table_tensor.size(1), dtype=block_table_tensor.dtype, device=device ).unsqueeze(0) < seq_lens_device.unsqueeze(1) paged_kv_indices = block_table_tensor[mask] - paged_kv_last_page_len = seq_lens_device % page_size - paged_kv_last_page_len = torch.where( - paged_kv_last_page_len == 0, page_size, paged_kv_last_page_len - ) + paged_kv_last_page_len = torch.where(seq_lens_device == 0, 1, seq_lens_device) paged_kv_indptr = torch.cat( [ @@ -151,12 +126,6 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): num_actual_pages = paged_kv_indices.size(0) - self.block_table_remapping[:num_reqs, :max_blk_size_per_req].copy_( - block_table_tensor, non_blocking=True - ) - block_table_tensor = self.block_table_remapping[ - :num_reqs, :max_blk_size_per_req - ] self.paged_kv_indices[:num_actual_pages].copy_( paged_kv_indices, non_blocking=True From 3168285fcaaee09bc93dce7bc9ae6ee823c71652 Mon Sep 17 00:00:00 2001 From: Fadi Arafeh <115173828+fadara01@users.noreply.github.com> Date: Thu, 20 Nov 2025 02:37:09 +0000 Subject: [PATCH 049/249] [cpu][ci] Add initial set of tests for Arm CPUs (#28657) Signed-off-by: Fadi Arafeh --- .../scripts/hardware_ci/run-cpu-test-arm.sh | 64 +++++++++++++++++++ docker/Dockerfile.cpu | 10 +++ 2 files changed, 74 insertions(+) create mode 100755 .buildkite/scripts/hardware_ci/run-cpu-test-arm.sh diff --git 
a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh new file mode 100755 index 0000000000000..d0036f24c8d04 --- /dev/null +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# This script builds the CPU docker image and runs offline inference inside the container. +# It serves as a sanity check for compilation and basic model usage. +set -ex + +# allow to bind to different cores +CORE_RANGE=${CORE_RANGE:-0-16} +OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16} +NUMA_NODE=${NUMA_NODE:-0} + +export CMAKE_BUILD_PARALLEL_LEVEL=32 + +# Setup cleanup +remove_docker_container() { + set -e; + docker rm -f cpu-test-"$NUMA_NODE" || true; +} +trap remove_docker_container EXIT +remove_docker_container + +# Try building the docker image +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . + +# Run the image, setting --shm-size=4g for tensor parallel.
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" + +function cpu_tests() { + set -e + export NUMA_NODE=$2 + + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pip list" + + # offline inference + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + + # Run kernel tests + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pytest -x -v -s tests/kernels/test_onednn.py + pytest -x -v -s tests/kernels/attention/test_cpu_attn.py" + + # basic online serving + docker exec cpu-test-"$NUMA_NODE" bash -c ' + set -e + VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve meta-llama/Llama-3.2-3B-Instruct --max-model-len 2048 & + server_pid=$! + timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 + vllm bench serve \ + --backend vllm \ + --dataset-name random \ + --model meta-llama/Llama-3.2-3B-Instruct \ + --num-prompts 20 \ + --endpoint /v1/completions + kill -s SIGTERM $server_pid &' +} + +# All CPU tests are expected to finish in less than 40 mins.
+export -f cpu_tests +timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 4c961defaeda2..eb3807ef0ca4e 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -37,6 +37,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \ && curl -LsSf https://astral.sh/uv/install.sh | sh +ENV CC=/usr/bin/gcc-12 CXX=/usr/bin/g++-12 ENV CCACHE_DIR=/root/.cache/ccache ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache @@ -122,6 +123,15 @@ WORKDIR /workspace/vllm RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ cp requirements/test.in requirements/cpu-test.in && \ sed -i '/mamba_ssm/d' requirements/cpu-test.in && \ + remove_packages_not_supported_on_aarch64() { \ + case "$(uname -m)" in \ + aarch64|arm64) \ + sed -i '/decord/d' requirements/cpu-test.in; \ + sed -i '/terratorch/d' requirements/cpu-test.in; \ + ;; \ + esac; \ + }; \ + remove_packages_not_supported_on_aarch64 && \ sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \ sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \ sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \ From fcbcba6c70a3308705aa21adebb443bf9015b486 Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Wed, 19 Nov 2025 22:17:48 -0500 Subject: [PATCH 050/249] [Feat] Iteration-level profiling for Torch and CUDA profiler (#28987) Signed-off-by: Benjamin Chislett Signed-off-by: Benjamin Chislett Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- tests/v1/worker/test_gpu_profiler.py | 203 +++++++++++++++++++++++++ vllm/envs.py | 16 ++ vllm/profiler/gpu_profiler.py | 217 ++++++++++++++++++++++++--- vllm/v1/engine/async_llm.py | 14 +- vllm/v1/worker/gpu_worker.py | 50 ++---- 5 files changed, 437 insertions(+), 63 deletions(-) create mode 100644 
tests/v1/worker/test_gpu_profiler.py diff --git a/tests/v1/worker/test_gpu_profiler.py b/tests/v1/worker/test_gpu_profiler.py new file mode 100644 index 0000000000000..f7255fae05a4e --- /dev/null +++ b/tests/v1/worker/test_gpu_profiler.py @@ -0,0 +1,203 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + +import vllm.envs as envs +from vllm.profiler.gpu_profiler import WorkerProfiler + + +class ConcreteWorkerProfiler(WorkerProfiler): + """ + A basic implementation of a worker profiler for testing purposes. + """ + + def __init__(self): + self.start_call_count = 0 + self.stop_call_count = 0 + self.should_fail_start = False + super().__init__() + + def _start(self) -> None: + if self.should_fail_start: + raise RuntimeError("Simulated start failure") + self.start_call_count += 1 + + def _stop(self) -> None: + self.stop_call_count += 1 + + +@pytest.fixture(autouse=True) +def reset_mocks(): + """Fixture to reset mocks and env variables before each test.""" + envs.VLLM_PROFILER_DELAY_ITERS = 0 + envs.VLLM_PROFILER_MAX_ITERS = 0 + + +def test_immediate_start_stop(): + """Test standard start without delay.""" + profiler = ConcreteWorkerProfiler() + + profiler.start() + assert profiler._running is True + assert profiler._active is True + assert profiler.start_call_count == 1 + + profiler.stop() + assert profiler._running is False + assert profiler._active is False + assert profiler.stop_call_count == 1 + + +def test_delayed_start(): + """Test that profiler waits for N steps before actually starting.""" + envs.VLLM_PROFILER_DELAY_ITERS = 2 + profiler = ConcreteWorkerProfiler() + + # User requests start + profiler.start() + + # Should be active (request accepted) but not running (waiting for delay) + assert profiler._active is True + assert profiler._running is False + assert profiler.start_call_count == 0 + + # Step 1 + profiler.step() + assert profiler._running is False + + # Step 2 (Threshold 
reached) + profiler.step() + assert profiler._running is True + assert profiler.start_call_count == 1 + + +def test_max_iterations(): + """Test that profiler stops automatically after max iterations.""" + envs.VLLM_PROFILER_MAX_ITERS = 2 + profiler = ConcreteWorkerProfiler() + + profiler.start() + assert profiler._running is True + + # Iteration 1 + profiler.step() # profiling_count becomes 1 + assert profiler._running is True + + # Iteration 2 + profiler.step() # profiling_count becomes 2 + assert profiler._running is True + + # Iteration 3 (Exceeds max) + profiler.step() # profiling_count becomes 3 + + # Should have stopped now + assert profiler._running is False + assert profiler.stop_call_count == 1 + + +def test_delayed_start_and_max_iters(): + """Test combined delayed start and max iterations.""" + envs.VLLM_PROFILER_DELAY_ITERS = 2 + envs.VLLM_PROFILER_MAX_ITERS = 2 + profiler = ConcreteWorkerProfiler() + + profiler.start() + + # Step 1 + profiler.step() + assert profiler._running is False + assert profiler._active is True + + # Step 2 (Starts now) + profiler.step() + assert profiler._profiling_for_iters == 1 + assert profiler._running is True + assert profiler._active is True + + # Next iteration + profiler.step() + assert profiler._profiling_for_iters == 2 + assert profiler._running is True + + # Iteration 3 (exceeds max) + profiler.step() + + # Should have stopped now + assert profiler._running is False + assert profiler.stop_call_count == 1 + + +def test_idempotency(): + """Test that calling start/stop multiple times doesn't break logic.""" + profiler = ConcreteWorkerProfiler() + + # Double Start + profiler.start() + profiler.start() + assert profiler.start_call_count == 1 # Should only start once + + # Double Stop + profiler.stop() + profiler.stop() + assert profiler.stop_call_count == 1 # Should only stop once + + +def test_step_inactive(): + """Test that stepping while inactive does nothing.""" + envs.VLLM_PROFILER_DELAY_ITERS = 2 + profiler =
ConcreteWorkerProfiler() + + # Not started yet + profiler.step() + profiler.step() + + # Even though we stepped 2 times, start shouldn't happen because active=False + assert profiler.start_call_count == 0 + + +def test_start_failure(): + """Test behavior when the underlying _start method raises exception.""" + profiler = ConcreteWorkerProfiler() + profiler.should_fail_start = True + + profiler.start() + + # Exception caught in _call_start + assert profiler._running is False # Should not mark as running + assert profiler._active is True # Request is still considered active + assert profiler.start_call_count == 0 # Logic failed inside start + + +def test_shutdown(): + """Test that shutdown calls stop only if running.""" + profiler = ConcreteWorkerProfiler() + + # Case 1: Not running + profiler.shutdown() + assert profiler.stop_call_count == 0 + + # Case 2: Running + profiler.start() + profiler.shutdown() + assert profiler.stop_call_count == 1 + + +def test_mixed_delay_and_stop(): + """Test manual stop during the delay period.""" + envs.VLLM_PROFILER_DELAY_ITERS = 5 + profiler = ConcreteWorkerProfiler() + + profiler.start() + profiler.step() + profiler.step() + + # User cancels before delay finishes + profiler.stop() + assert profiler._active is False + + # Further steps should not trigger start + profiler.step() + profiler.step() + profiler.step() + + assert profiler.start_call_count == 0 diff --git a/vllm/envs.py b/vllm/envs.py index 614bc94b978bd..888a09cf6d3ec 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -92,11 +92,14 @@ if TYPE_CHECKING: VLLM_TORCH_PROFILER_DIR: str | None = None VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False + VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM: bool = False VLLM_USE_AOT_COMPILE: bool = False VLLM_USE_BYTECODE_HOOK: bool = False VLLM_FORCE_AOT_LOAD: bool = False VLLM_TORCH_PROFILER_WITH_STACK: bool = True VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False + VLLM_PROFILER_DELAY_ITERS: int = 0 
+ VLLM_PROFILER_MAX_ITERS: int = 0 VLLM_USE_TRITON_AWQ: bool = False VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_SKIP_P2P_CHECK: bool = False @@ -872,6 +875,19 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: bool( os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0" ), + # Disable torch profiling of the AsyncLLMEngine process. + # If set to 1, will not profile the engine process. + "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM": lambda: bool( + os.getenv("VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM", "0") != "0" + ), + # Number of iterations to wait before starting profiling when using + # the Torch profiler or the CUDA profiler. If set to 0, profiling starts immediately. + "VLLM_PROFILER_DELAY_ITERS": lambda: int( + os.getenv("VLLM_PROFILER_DELAY_ITERS", "0") + ), + # Maximum number of iterations to profile when using the Torch or CUDA profiler. + # If set to 0, will not limit the number of iterations. + "VLLM_PROFILER_MAX_ITERS": lambda: int(os.getenv("VLLM_PROFILER_MAX_ITERS", "0")), # If set, vLLM will use Triton implementations of AWQ.
"VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))), # If set, allow loading or unloading lora adapters in runtime, diff --git a/vllm/profiler/gpu_profiler.py b/vllm/profiler/gpu_profiler.py index 58c6689531615..2155b67a3db4b 100644 --- a/vllm/profiler/gpu_profiler.py +++ b/vllm/profiler/gpu_profiler.py @@ -1,37 +1,212 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod +from contextlib import nullcontext + +import torch +from typing_extensions import override + +import vllm.envs as envs from vllm.logger import init_logger logger = init_logger(__name__) -class CudaProfilerWrapper: +class WorkerProfiler(ABC): def __init__(self) -> None: - self._profiler_running = False + self._delay_iters = envs.VLLM_PROFILER_DELAY_ITERS + if self._delay_iters > 0: + logger.info_once( + "GPU profiling will start " + f"{self._delay_iters} steps after start_profile." + ) + + self._max_iters = envs.VLLM_PROFILER_MAX_ITERS + if self._max_iters > 0: + logger.info_once( + "GPU profiling will stop " + f"after {self._max_iters} worker steps, " + "or when stop_profile is received." 
+ ) + + # Track when the profiler gets triggered by start_profile + self._active_iteration_count = 0 + self._active = False + + # Track when the profiler is actually running + self._profiling_for_iters = 0 + self._running = False + + @abstractmethod + def _start(self) -> None: + """Start the profiler.""" + pass + + @abstractmethod + def _stop(self) -> None: + """Stop the profiler.""" + pass + + def _call_start(self) -> None: + """Call _start with error handling but no safeguards.""" + try: + self._start() + self._running = True # Only mark as running if start succeeds + except Exception as e: + logger.warning("Failed to start profiler: %s", e) + + def _call_stop(self) -> None: + """Call _stop with error handling but no safeguards.""" + try: + self._stop() + logger.info("Profiler stopped successfully.") + except Exception as e: + logger.warning("Failed to stop profiler: %s", e) + self._running = False # Always mark as not running, assume stop worked + + def start(self) -> None: + """Attempt to start the profiler, accounting for delayed starts.""" + if self._active: + logger.debug( + "start_profile received when profiler is already active. " + "Ignoring request." 
+ ) + return + self._active = True + if self._delay_iters == 0: + self._call_start() + + def step(self) -> None: + """Update the profiler state at each worker step, + to handle delayed starts and max iteration limits.""" + if not self._active: + return + + self._active_iteration_count += 1 + + if ( + not self._running + and self._delay_iters > 0 + and self._active_iteration_count == self._delay_iters + ): + logger.info("Starting profiler after delay...") + self._call_start() + + if self._running: + self._profiling_for_iters += 1 + + if ( + self._max_iters > 0 + and self._running + and self._profiling_for_iters > self._max_iters + ): + # Automatically stop the profiler after max iters + # will be marked as not running, but leave as active so that stop + # can clean up properly + logger.info("Max profiling iterations reached. Stopping profiler...") + self._call_stop() + return + + def stop(self) -> None: + """Attempt to stop the profiler, accounting for overlapped calls.""" + if not self._active: + logger.debug( + "stop_profile received when profiler is not active. Ignoring request." + ) + return + self._active = False + self._active_iteration_count = 0 + self._profiling_for_iters = 0 + + if self._running: + self._call_stop() + + def shutdown(self) -> None: + """Ensure profiler is stopped when shutting down.""" + logger.info_once("Shutting down profiler") + if self._running: + self.stop() + + def annotate_context_manager(self, name: str): + """Return a context manager to annotate profiler traces.""" + return nullcontext() + + +class TorchProfilerWrapper(WorkerProfiler): + def __init__(self, worker_name: str, local_rank: int) -> None: + super().__init__() + + self.local_rank = local_rank + torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + logger.info( + "Torch profiling enabled. 
Traces will be saved to: %s", + torch_profiler_trace_dir, + ) + logger.debug( + "Profiler config: record_shapes=%s," + "profile_memory=%s,with_stack=%s,with_flops=%s", + envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, + envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, + envs.VLLM_TORCH_PROFILER_WITH_STACK, + envs.VLLM_TORCH_PROFILER_WITH_FLOPS, + ) + self.profiler = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, + profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, + with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, + with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, + on_trace_ready=torch.profiler.tensorboard_trace_handler( + torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True + ), + ) + + @override + def _start(self) -> None: + self.profiler.start() + + @override + def _stop(self) -> None: + self.profiler.stop() + + rank = self.local_rank + profiler_dir = envs.VLLM_TORCH_PROFILER_DIR + profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt" + sort_key = "self_cuda_time_total" + table = self.profiler.key_averages().table(sort_by=sort_key) + + with open(profiler_out_file, "w") as f: + print(table, file=f) + + # only print profiler results on rank 0 + if rank == 0: + print(table) + + @override + def annotate_context_manager(self, name: str): + return torch.profiler.record_function(name) + + +class CudaProfilerWrapper(WorkerProfiler): + def __init__(self) -> None: + super().__init__() # Note: lazy import to avoid dependency issues if CUDA is not available. 
import torch.cuda.profiler as cuda_profiler self._cuda_profiler = cuda_profiler - def start(self) -> None: - try: - self._cuda_profiler.start() - self._profiler_running = True - logger.info_once("Started CUDA profiler") - except Exception as e: - logger.warning_once("Failed to start CUDA profiler: %s", e) + @override + def _start(self) -> None: + self._cuda_profiler.start() - def stop(self) -> None: - if self._profiler_running: - try: - self._cuda_profiler.stop() - logger.info_once("Stopped CUDA profiler") - except Exception as e: - logger.warning_once("Failed to stop CUDA profiler: %s", e) - finally: - self._profiler_running = False + @override + def _stop(self) -> None: + self._cuda_profiler.stop() - def shutdown(self) -> None: - """Ensure profiler is stopped when shutting down.""" - self.stop() + @override + def annotate_context_manager(self, name: str): + return torch.cuda.nvtx.range(name) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index c160c7cbcab4a..abf2c8cfa4539 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -160,11 +160,23 @@ class AsyncLLM(EngineClient): except RuntimeError: pass - if envs.VLLM_TORCH_PROFILER_DIR: + if ( + envs.VLLM_TORCH_PROFILER_DIR + and not envs.VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM + ): logger.info( "Torch profiler enabled. AsyncLLM CPU traces will be collected under %s", # noqa: E501 envs.VLLM_TORCH_PROFILER_DIR, ) + if envs.VLLM_PROFILER_MAX_ITERS > 0 or envs.VLLM_PROFILER_DELAY_ITERS > 0: + logger.warning_once( + "Torch profiler received max_iters or delay_iters setting. These " + "are not compatible with the AsyncLLM profiler and will be ignored " + "for the AsyncLLM process. Engine process profiling will still " + "respect these settings. Consider setting " + "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM=1 to disable " + "AsyncLLM profiling." 
+ ) worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm" self.profiler = torch.profiler.profile( activities=[ diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 7f9cdd221224b..18cbc38262793 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -36,7 +36,7 @@ from vllm.model_executor import set_random_seed from vllm.model_executor.models.interfaces import is_mixture_of_experts from vllm.model_executor.warmup.kernel_warmup import kernel_warmup from vllm.platforms import current_platform -from vllm.profiler.gpu_profiler import CudaProfilerWrapper +from vllm.profiler.gpu_profiler import CudaProfilerWrapper, TorchProfilerWrapper from vllm.sequence import IntermediateTensors from vllm.tasks import SupportedTask from vllm.utils.mem_constants import GiB_bytes @@ -90,32 +90,9 @@ class Worker(WorkerBase): # Torch profiler. Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace if envs.VLLM_TORCH_PROFILER_DIR: - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" - logger.info( - "Profiling enabled. 
Traces will be saved to: %s", - torch_profiler_trace_dir, - ) - logger.debug( - "Profiler config: record_shapes=%s," - "profile_memory=%s,with_stack=%s,with_flops=%s", - envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - envs.VLLM_TORCH_PROFILER_WITH_STACK, - envs.VLLM_TORCH_PROFILER_WITH_FLOPS, - ) - self.profiler = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, - with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, - on_trace_ready=torch.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir, worker_name=worker_name, use_gzip=True - ), + self.profiler = TorchProfilerWrapper( + worker_name=worker_name, local_rank=self.local_rank ) elif envs.VLLM_TORCH_CUDA_PROFILE: self.profiler = CudaProfilerWrapper() @@ -526,10 +503,12 @@ class Worker(WorkerBase): if not self.profiler: return nullcontext() + self.profiler.step() + num_new = len(scheduler_output.scheduled_new_reqs) num_cached = len(scheduler_output.scheduled_cached_reqs.req_ids) - return torch.profiler.record_function( + return self.profiler.annotate_context_manager( f"execute_new_{num_new}_cached_{num_cached}" ) @@ -587,24 +566,11 @@ class Worker(WorkerBase): def profile(self, is_start: bool = True): if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") + raise RuntimeError("Profiling is not enabled.") if is_start: self.profiler.start() else: self.profiler.stop() - if isinstance(self.profiler, torch.profiler.profile): - rank = self.local_rank - profiler_dir = envs.VLLM_TORCH_PROFILER_DIR - profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt" - sort_key = "self_cuda_time_total" - table = self.profiler.key_averages().table(sort_by=sort_key) - - with open(profiler_out_file, "w") as f: - print(table, 
file=f) - - # only print profiler results on rank 0 - if rank == 0: - print(table) def execute_dummy_batch(self) -> None: self.model_runner._dummy_run(1, uniform_decode=True) @@ -865,6 +831,8 @@ class Worker(WorkerBase): def shutdown(self) -> None: if runner := getattr(self, "model_runner", None): runner.ensure_kv_transfer_shutdown() + if self.profiler is not None: + self.profiler.shutdown() def init_worker_distributed_environment( From a8c536829cb7b5564f54beff97e938666f286dd6 Mon Sep 17 00:00:00 2001 From: Shengliang Xu <106840466+shengliangxu@users.noreply.github.com> Date: Wed, 19 Nov 2025 19:39:36 -0800 Subject: [PATCH 051/249] Consolidate Nvidia ModelOpt quant config handling for all quantization methods (#28076) Signed-off-by: Shengliang Xu --- .../layers/quantization/modelopt.py | 499 ++++++++---------- 1 file changed, 234 insertions(+), 265 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index dedab33c1bdb7..6b5ed7762eb31 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Callable +from fnmatch import fnmatch from typing import TYPE_CHECKING, Any, Optional import torch @@ -13,7 +14,6 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import ( - FusedMoEConfig, FusedMoEQuantConfig, RoutingMethodType, fp8_w8a8_moe_quant_config, @@ -86,45 +86,218 @@ QUANT_ALGOS = ["FP8", "NVFP4"] KV_CACHE_QUANT_ALGOS = ["FP8"] -class ModelOptFp8Config(QuantizationConfig): +class ModelOptFp8KVCacheMethod(BaseKVCacheMethod): + """ + Supports loading kv-cache scaling factors from FP8 checkpoints. 
+ """ + + def __init__(self, quant_config: "ModelOptQuantConfigBase"): + super().__init__(quant_config) + + +class ModelOptQuantConfigBase(QuantizationConfig): + LinearMethodCls: type = LinearMethodBase + FusedMoEMethodCls: type = FusedMoEMethodBase + KVCacheMethodCls: type = BaseKVCacheMethod + + def __init__( + self, + exclude_modules: list[str], + ): + super().__init__() + self.exclude_modules: list[str] = exclude_modules + + def is_layer_excluded(self, prefix: str) -> bool: + """ + Check if a layer should be excluded from quantization. + + Handles both exact matching (for fused layers) and ModelOpt wildcard matching. + + The ModelOpt exclude_modules list is a list of wildcards. + """ + if len(self.exclude_modules) == 0: + return False + + # First check exact matching with fused layer support + if is_layer_skipped(prefix, self.exclude_modules, self.packed_modules_mapping): + return True + + # TODO: This special hard coded logic is not needed for quantized checkpoints + # generated by ModelOpt >= 0.39.0 where they are handled natually by the + # exclude_modules config. But need to keep them for loading quantized + # checkpoints generated by older versions. 
Then check substring matching + # for patterns not caught by exact match + for exclude_module in self.exclude_modules: + # Skip exact matches already handled above + if exclude_module != prefix and ( + exclude_module in prefix + or ( + prefix.startswith("language_model.") + and exclude_module in prefix.removeprefix("language_model.") + ) + ): + return True + + # modelopt exclude modules are not simple strings, they are wildcards + for wildcard_pattern in self.exclude_modules: + if fnmatch(prefix, wildcard_pattern): + return True + + return False + + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional["QuantizeMethodBase"]: + from vllm.attention.layer import Attention # Avoid circular import + + # handle kv-cache first so we can focus only on weight quantization thereafter + if isinstance(layer, Attention): + return self.KVCacheMethodCls(self) + + # handle exclusion + if self.is_layer_excluded(prefix): + if isinstance(layer, LinearBase): + return UnquantizedLinearMethod() + return None + + # TODO: This special hard coded logic is not needed for quantized checkpoints + # generated by ModelOpt >= 0.39.0 where they are handled natually by the + # exclude_modules config. But need to keep them for loading quantized + # checkpoints generated by older versions. 
Then check substring matching + # for patterns not caught by exact match + if "vision_tower" in prefix or "vision_model" in prefix: + return UnquantizedLinearMethod() + + # now, the layer is quantized, handle it here + if isinstance(layer, LinearBase): + return self.LinearMethodCls(self) + elif isinstance(layer, FusedMoE): + return self.FusedMoEMethodCls(quant_config=self, layer=layer) + + return None + + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + if len(self.exclude_modules) > 0: + self.exclude_modules = hf_to_vllm_mapper.apply_list(self.exclude_modules) + + @staticmethod + def get_config_filenames() -> list[str]: + return ["hf_quant_config.json"] + + @classmethod + def _from_config( + cls, + *, + quant_method: str, + kv_cache_quant_method: str | None, + exclude_modules: list[str], + original_config: dict[str, Any], + group_size: int | None, + ) -> "ModelOptQuantConfigBase": + raise NotImplementedError("Please implement this function in sub classes") + + @classmethod + def from_config(cls, config: dict[str, Any]) -> "ModelOptQuantConfigBase": + # Handle both ModelOpt format and compressed-tensors style format + if "quantization" in config: + # Traditional ModelOpt format: + # {"quantization": {"quant_algo": "..."}} + quant_config = cls.get_from_keys(config, ["quantization"]) + if not isinstance(quant_config, dict): + raise ValueError("Expected 'quantization' to be a dictionary in config") + + quant_method = quant_config.get("quant_algo") + + # Handle kv_cache_quant_algo with proper type validation + kv_cache_quant_method = quant_config.get("kv_cache_quant_algo") + + # Handle group_size with proper type validation + group_size_raw = quant_config.get("group_size") + + # "exclude_modules" is the key in the legacy hf_quant_config.json + exclude_modules = quant_config.get("exclude_modules", []) + else: + # Compressed-tensors style format: + # {"quant_algo": "...", "quant_method": "modelopt"} + quant_method = config.get("quant_algo") + 
kv_cache_quant_method = config.get("kv_cache_quant_algo") + # "ignore" is the key in config.json + exclude_modules = config.get("ignore", []) + group_size_raw = config.get("group_size") + + if not quant_method: + raise ValueError("Missing 'quant_algo' in quantization config") + + if kv_cache_quant_method is None: + # No KV cache quantization, keep this branch just to have this comment + pass + elif not isinstance(kv_cache_quant_method, str): + raise ValueError( + f"kv_cache_quant_algo must be a string, got " + f"{type(kv_cache_quant_method)}" + ) + + if not isinstance(exclude_modules, list): + raise ValueError( + f"exclude_modules must be a list, got {type(exclude_modules)}" + ) + + if group_size_raw is None: + group_size = None + elif isinstance(group_size_raw, int): + group_size = group_size_raw + else: + try: + group_size = int(group_size_raw) + except (ValueError, TypeError): + raise ValueError( + f"group_size must be an integer, got {type(group_size_raw)}" + ) from None + + if quant_method not in QUANT_ALGOS: + raise ValueError( + f"ModelOpt currently only supports: {QUANT_ALGOS} " + "quantizations in vLLM. Please check the " + "`hf_quant_config.json` file for your model's " + "quant configuration." 
+ ) + return cls._from_config( + quant_method=quant_method, + kv_cache_quant_method=kv_cache_quant_method, + exclude_modules=exclude_modules, + group_size=group_size, + original_config=config, + ) + + +class ModelOptFp8Config(ModelOptQuantConfigBase): """Config class for ModelOpt FP8.""" def __init__( self, - is_checkpoint_fp8_serialized: bool = False, - kv_cache_quant_method: str | None = None, - exclude_modules: list[str] | None = None, + is_checkpoint_fp8_serialized: bool, + kv_cache_quant_method: str | None, + exclude_modules: list[str], ) -> None: - super().__init__() + super().__init__(exclude_modules) self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized self.kv_cache_quant_method = kv_cache_quant_method - self.exclude_modules = exclude_modules or [] if is_checkpoint_fp8_serialized: logger.warning( "Detected ModelOpt fp8 checkpoint. Please note that" " the format is experimental and could change." ) - @classmethod - def get_name(cls) -> QuantizationMethods: + def get_name(self) -> QuantizationMethods: return "modelopt" - @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(self) -> list[torch.dtype]: return [torch.bfloat16, torch.half] @classmethod def get_min_capability(cls) -> int: return 89 - @classmethod - def get_config_filenames(cls) -> list[str]: - return ["hf_quant_config.json"] - - def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): - if self.exclude_modules is not None: - self.exclude_modules = hf_to_vllm_mapper.apply_list(self.exclude_modules) - @classmethod def override_quantization_method( cls, hf_quant_cfg, user_quant @@ -158,88 +331,19 @@ class ModelOptFp8Config(QuantizationConfig): return None @classmethod - def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config": - # Handle both ModelOpt format and compressed-tensors style format - if "quantization" in config: - # ModelOpt format: {"quantization": {"quant_algo": "..."}} - quant_config = 
cls.get_from_keys(config, ["quantization"]) - if not isinstance(quant_config, dict): - raise ValueError("Expected 'quantization' to be a dictionary in config") - quant_method = quant_config.get("quant_algo", "") - if not quant_method: - raise ValueError("Missing 'quant_algo' in quantization config") - kv_cache_quant_method = quant_config.get("kv_cache_quant_algo") - # "exclude_modules" is the key in the legacy hf_quant_config.json - exclude_modules = quant_config.get("exclude_modules") - else: - # Compressed-tensors style format: - # {"quant_algo": "...", "quant_method": "modelopt"} - quant_method = config.get("quant_algo", "") - kv_cache_quant_method = config.get("kv_cache_quant_algo") - # "ignore" is the key in config.json - exclude_modules = config.get("ignore") - - if quant_method not in QUANT_ALGOS: - raise ValueError( - f"ModelOpt currently only supports: {QUANT_ALGOS} " - "quantizations in vLLM. Please check the " - "`hf_quant_config.json` file for your model's " - "quant configuration." - ) + def _from_config( + cls, + *, + quant_method: str, + kv_cache_quant_method: str | None, + exclude_modules: list[str], + original_config: dict[str, Any], + **kwargs: Any, + ) -> "ModelOptFp8Config": is_checkpoint_fp8_serialized = "FP8" in quant_method return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, exclude_modules) - def is_layer_excluded(self, prefix: str) -> bool: - """ - Check if a layer should be excluded from quantization. - Handles both exact matching (for fused layers) and substring matching. - - This method handles both regular models and multimodal models that use - the language_model prefix. For multimodal models, it checks if the - module name (without the language_model prefix) is in the exclude list. 
- """ - if self.exclude_modules is None: - return False - - # First check exact matching with fused layer support - if is_layer_skipped(prefix, self.exclude_modules, self.packed_modules_mapping): - return True - - # Then check substring matching for patterns not caught by exact match - for module in self.exclude_modules: - # Skip exact matches already handled above - if module != prefix and ( - module in prefix - or ( - prefix.startswith("language_model.") - and module in prefix.removeprefix("language_model.") - ) - ): - return True - return False - - def get_quant_method( - self, layer: torch.nn.Module, prefix: str - ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import ( # Avoid circular import - Attention, - MLAAttention, - ) - - if isinstance(layer, LinearBase): - if self.is_layer_excluded(prefix): - return UnquantizedLinearMethod() - # Check if this is a vision model layer that should not be quantized - if "vision_tower" in prefix or "vision_model" in prefix: - return UnquantizedLinearMethod() - return ModelOptFp8LinearMethod(self) - elif isinstance(layer, (Attention, MLAAttention)): - return ModelOptFp8KVCacheMethod(self) - elif isinstance(layer, FusedMoE): - return ModelOptFp8MoEMethod(self, layer) - return None - class ModelOptFp8LinearMethod(LinearMethodBase): """Linear method for Model Optimizer static quantization. 
@@ -344,7 +448,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): def __init__( self, quant_config: ModelOptFp8Config, - layer: torch.nn.Module, + layer: FusedMoE, ) -> None: super().__init__(layer.moe_config) self.layer = layer @@ -686,7 +790,12 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): ) -class ModelOptNvFp4Config(QuantizationConfig): +ModelOptFp8Config.LinearMethodCls = ModelOptFp8LinearMethod +ModelOptFp8Config.FusedMoEMethodCls = ModelOptFp8MoEMethod +ModelOptFp8Config.KVCacheMethodCls = ModelOptFp8KVCacheMethod + + +class ModelOptNvFp4Config(ModelOptQuantConfigBase): """Config class for ModelOpt FP4.""" def __init__( @@ -696,7 +805,7 @@ class ModelOptNvFp4Config(QuantizationConfig): exclude_modules: list[str], group_size: int = 16, ) -> None: - super().__init__() + super().__init__(exclude_modules) self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized if is_checkpoint_nvfp4_serialized: logger.warning( @@ -706,28 +815,17 @@ class ModelOptNvFp4Config(QuantizationConfig): self.group_size = group_size self.kv_cache_quant_algo = kv_cache_quant_algo - self.exclude_modules = exclude_modules - @classmethod - def get_name(cls) -> QuantizationMethods: + def get_name(self) -> QuantizationMethods: return "modelopt_fp4" - @classmethod - def get_supported_act_dtypes(cls) -> list[torch.dtype]: + def get_supported_act_dtypes(self) -> list[torch.dtype]: return [torch.bfloat16, torch.half, torch.float8_e4m3fn] @classmethod def get_min_capability(cls) -> int: return 80 - @classmethod - def get_config_filenames(cls) -> list[str]: - return ["hf_quant_config.json"] - - def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): - if self.exclude_modules is not None: - self.exclude_modules = hf_to_vllm_mapper.apply_list(self.exclude_modules) - @classmethod def override_quantization_method( cls, hf_quant_cfg, user_quant @@ -761,105 +859,25 @@ class ModelOptNvFp4Config(QuantizationConfig): return None @classmethod - def from_config(cls, config: 
dict[str, Any]) -> "ModelOptNvFp4Config": - # Handle both traditional ModelOpt format and compressed-tensors - # style format - if "quantization" in config: - # Traditional ModelOpt format: - # {"quantization": {"quant_algo": "..."}} - quant_config = cls.get_from_keys(config, ["quantization"]) - if not isinstance(quant_config, dict): - raise ValueError("Expected 'quantization' to be a dictionary in config") - - quant_method = quant_config.get("quant_algo", "") - if not quant_method: - raise ValueError("Missing 'quant_algo' in quantization config") - - # Handle kv_cache_quant_algo with proper type validation - kv_cache_quant_algo_raw = quant_config.get("kv_cache_quant_algo") - if kv_cache_quant_algo_raw is None: - # No KV cache quantization by default - kv_cache_quant_algo = None - elif isinstance(kv_cache_quant_algo_raw, str): - kv_cache_quant_algo = kv_cache_quant_algo_raw - else: - raise ValueError( - f"kv_cache_quant_algo must be a string, got " - f"{type(kv_cache_quant_algo_raw)}" - ) - - # Handle group_size with proper type validation - group_size_raw = quant_config.get("group_size") - if group_size_raw is None: - group_size = 16 # Default value - elif isinstance(group_size_raw, int): - group_size = group_size_raw - else: - try: - group_size = int(group_size_raw) - except (ValueError, TypeError): - raise ValueError( - f"group_size must be an integer, got {type(group_size_raw)}" - ) from None - - # "exclude_modules" is the key in the legacy hf_quant_config.json - exclude_modules = quant_config.get("exclude_modules", []) - if not isinstance(exclude_modules, list): - raise ValueError( - f"exclude_modules must be a list, got {type(exclude_modules)}" - ) - else: - # Compressed-tensors style format: - # {"quant_algo": "...", "quant_method": "modelopt"} - quant_method = config.get("quant_algo", "") - - # Handle kv_cache_quant_algo with proper type validation - kv_cache_quant_algo_raw = config.get("kv_cache_quant_algo") - if kv_cache_quant_algo_raw is None: - # No KV 
cache quantization by default - kv_cache_quant_algo = None - elif isinstance(kv_cache_quant_algo_raw, str): - kv_cache_quant_algo = kv_cache_quant_algo_raw - else: - raise ValueError( - f"kv_cache_quant_algo must be a string, got " - f"{type(kv_cache_quant_algo_raw)}" - ) - - # Handle group_size with proper type validation - group_size_raw = config.get("group_size") - if group_size_raw is None: - group_size = 16 # Default value - elif isinstance(group_size_raw, int): - group_size = group_size_raw - else: - try: - group_size = int(group_size_raw) - except (ValueError, TypeError): - raise ValueError( - f"group_size must be an integer, got {type(group_size_raw)}" - ) from None - - # "ignore" is the key in config.json - exclude_modules = config.get("ignore", []) - if not isinstance(exclude_modules, list): - raise ValueError( - f"exclude_modules must be a list, got {type(exclude_modules)}" - ) - - if quant_method not in QUANT_ALGOS: - raise ValueError( - f"ModelOpt currently only supports: {QUANT_ALGOS} " - "quantizations in vLLM. Please check the " - "`hf_quant_config.json` file for your model's " - "quant configuration." 
- ) + def _from_config( + cls, + *, + quant_method: str, + kv_cache_quant_method: str | None, + exclude_modules: list[str], + original_config: dict[str, Any], + group_size: int | None, + **kwargs: Any, + ) -> "ModelOptNvFp4Config": is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method + if group_size is None: + group_size = 16 # Default value + # For FP4, these fields are required - if is_checkpoint_nvfp4_serialized and "quantization" in config: + if is_checkpoint_nvfp4_serialized and "quantization" in original_config: # Check if required fields are present in the quantization config - quant_config = config["quantization"] + quant_config = original_config["quantization"] required_fields = ["group_size", "kv_cache_quant_algo", "exclude_modules"] missing_fields = [ field for field in required_fields if field not in quant_config @@ -872,64 +890,11 @@ class ModelOptNvFp4Config(QuantizationConfig): return cls( is_checkpoint_nvfp4_serialized, - kv_cache_quant_algo, + kv_cache_quant_method, exclude_modules, group_size, ) - def is_layer_excluded(self, prefix: str) -> bool: - """ - Check if a layer should be excluded from quantization. - Handles both exact matching (for fused layers) and pattern matching. - """ - # First check exact matching with fused layer support - if is_layer_skipped(prefix, self.exclude_modules, self.packed_modules_mapping): - return True - - # Check regex pattern matching for patterns not caught by exact match - import regex as re - - for pattern in self.exclude_modules: - # Skip patterns that would be caught by exact matching - if "*" in pattern or "." 
in pattern: - regex_str = pattern.replace(".", r"\.").replace("*", r".*") - if re.fullmatch(regex_str, prefix): - return True - return False - - def get_quant_method( - self, layer: torch.nn.Module, prefix: str - ) -> Optional["QuantizeMethodBase"]: - from vllm.attention.layer import ( # Avoid circular import - Attention, - MLAAttention, - ) - - skip_layer = self.is_layer_excluded(prefix) - if isinstance(layer, LinearBase): - if skip_layer: - return UnquantizedLinearMethod() - # Check if this is a vision model layer that should not be quantized - if "vision_tower" in prefix or "vision_model" in prefix: - return UnquantizedLinearMethod() - return ModelOptNvFp4LinearMethod(self) - elif isinstance(layer, (Attention, MLAAttention)): - return ModelOptFp8KVCacheMethod(self) - elif isinstance(layer, FusedMoE): - if skip_layer: - return None - return ModelOptNvFp4FusedMoE(self, layer.moe_config, layer) - return None - - -class ModelOptFp8KVCacheMethod(BaseKVCacheMethod): - """ - Supports loading kv-cache scaling factors from FP8 checkpoints. - """ - - def __init__(self, quant_config: ModelOptFp8Config | ModelOptNvFp4Config): - super().__init__(quant_config) - class ModelOptNvFp4LinearMethod(LinearMethodBase): """Linear method for Model Optimizer NVFP4. 
@@ -1157,14 +1122,13 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): def __init__( self, quant_config: ModelOptNvFp4Config, - moe: FusedMoEConfig, - layer: torch.nn.Module, + layer: FusedMoE, ) -> None: from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( detect_nvfp4_moe_support, # noqa: E501 ) - super().__init__(moe) + super().__init__(layer.moe_config) self.quant_config = quant_config self.layer = layer _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__) @@ -1802,3 +1766,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): k=x.shape[1], e=layer.w13_weight.shape[0], ) + + +ModelOptNvFp4Config.LinearMethodCls = ModelOptNvFp4LinearMethod +ModelOptNvFp4Config.FusedMoEMethodCls = ModelOptNvFp4FusedMoE +ModelOptNvFp4Config.KVCacheMethodCls = ModelOptFp8KVCacheMethod From 0cca9b4d130b4caddb60086ef26a0d8741582dcb Mon Sep 17 00:00:00 2001 From: prashanth058 Date: Wed, 19 Nov 2025 19:50:37 -0800 Subject: [PATCH 052/249] [Bugfix] Fix precision loss in LoRA-wrapped RowParallelLinear by fusing bias into GEMM (#28972) Signed-off-by: prashanth058 --- vllm/lora/layers/row_parallel_linear.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py index 2ef1bd98fc612..95517b1aee263 100644 --- a/vllm/lora/layers/row_parallel_linear.py +++ b/vllm/lora/layers/row_parallel_linear.py @@ -63,23 +63,18 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): input_parallel = splitted_input[self.tp_rank].contiguous() # Matrix multiply. 
- output_parallel = self.apply(input_parallel) + bias_ = ( + None + if (self.tp_rank > 0 or self.base_layer.skip_bias_add) + else self.base_layer.bias + ) + output_parallel = self.apply(input_parallel, bias_) if self.base_layer.reduce_results and self.tp_size > 1: - output_ = tensor_model_parallel_all_reduce(output_parallel) + output = tensor_model_parallel_all_reduce(output_parallel) else: - output_ = output_parallel - - if not self.base_layer.skip_bias_add: - output = ( - output_ + self.base_layer.bias - if self.base_layer.bias is not None - else output_ - ) - output_bias = None - else: - output = output_ - output_bias = self.base_layer.bias + output = output_parallel + output_bias = self.base_layer.bias if self.base_layer.skip_bias_add else None if not self.base_layer.return_bias: return output @@ -120,7 +115,7 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA): return lora_b def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x) + output = self.base_layer.quant_method.apply(self.base_layer, x, bias) x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape From fe25772aa97beb8bcb07ea49e06a2892b521a7ed Mon Sep 17 00:00:00 2001 From: Canlin Guo Date: Thu, 20 Nov 2025 12:38:12 +0800 Subject: [PATCH 053/249] [Bugfix] Handle broken frames in video loading (#29001) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: gcanlin Signed-off-by: 凌葭 Co-authored-by: 凌葭 --- tests/multimodal/assets/corrupted.mp4 | Bin 0 -> 91678 bytes tests/multimodal/test_video.py | 37 ++++++++ vllm/multimodal/video.py | 118 ++++++++++++++++---------- 3 files changed, 112 insertions(+), 43 deletions(-) create mode 100644 tests/multimodal/assets/corrupted.mp4 diff --git a/tests/multimodal/assets/corrupted.mp4 b/tests/multimodal/assets/corrupted.mp4 new file mode 100644 index 
0000000000000000000000000000000000000000..c355bb932ceeeae13cc2d0a4752dcdf8c5136720 GIT binary patch literal 91678 zcmYJZV{|A@6D=IuImyY1ZQHhO+qP}nwr$(CZQHqd-uvOl^z7cftGcSXSI=5A0{{Sk zYwYM|Yvy2M1poj9@ZbISqStdaptG`Nqyqo|fH1Z zTQ^H1V@F(iT3QAwdRls>U((dk(UzTt#>K^j+S$y|$i`C7n%c(RgyuiL)TWMBmcJSs zTSqe+YX^2*13i5`11>sTdn02m23$iUeM=hy3obf#T6S7oJ!?HnHwPmwT2}^kT30$c z23#v6E>j~{Tn8upU&MlI>)`fl`rGQ*8*)1u11DB|9PbQwa~HGvo+y4~**MXg_j*YRggOMW_6+Nz_sr|2o0~aH%rHze+p6PE#=l|y%EX@pl zWBGpxI^6#e|EaC*|2LU|nU$X7e*u|UI~v(r>iw#Jd3{SKdp$QD0~;$_J;&d^!Edb` z?e)y8e|`N5?e+fC7~AVv89Dq`O<%{>?H8LFa?$@{JwrX)|I*Rd(Kpj`_-_(3d!zp) z=wf7MV(O^>J7!~RWUXUjWBWV$e@WY4Qwt-vU*BBxOtk+G)Uh(N=3>HiFfg($!3H8HdOU98`Y@w=A4zVz&V_rZUw3g8O>fY@jh8VI2A zvu{t=>S!UFYi^@R=LI9)fqSS^NNgmVo^)3?V7)u;f_ak5?>@9YGvrl$R^ znP+^J6xq+sI(DsF*>p0}8m$B!x|G#0)yvyB*iXl?@Or`Yz`G`-HB+OkhDMV1;)PLC zSz1;b?dP57Djj$Se7RT0_wA7Mm`gZ$YLd-*qO?CtRtY5kPU4BD2v4#CqBPwT7f8mg zCpO_(cuNn|M$AX68D}i}01(x?OPbRE5-82_*UUFnTfBv{Z<_2J(Wd9-uHKzoeJVP> zwpoyH!%lTPtfZ_S*j1T!Wgsa}9RPcOzaE20sc0%+TUyvjtt2bliJ6@T6@A5cF>Q71 z6(1X&ZeTWgy^Y?9Wg6QAe*Of`Y+Eu`WIPe{(tN)?p8`D)SUAt0a0B-2-&K>%ji|YDb3Bf0n3DIi-dAh!6@CXCUM({{(09A1@udG z|KuI>FZJ(Eo@PX#3kmcv5UESADcZ2)68a`2k~ww)uYaDAv_ll%QAQ9CF3*D^p_gs5 z2Sy!a~{V~ zKI_|2SH8ZAXTk%1jmc{-1wUT15Fj2lB4}#V$GMexgIt%yKODUZm~BmXr+ z12#b+=M&P5_5_Td9Qn>=nhszeJ45>+GN{hO#$z_XTo?}%KR&*T#iNg+W=i6ok>QuB zTJo1d{iR!UH3GN8C-YQ|%~M3qE(Y7$cS(7)(5$-Lqp<2JyQho+&!yNMQf80m*eH(j zfykGPM2{WSDBKTh&GNg}@iuwKqg_5T+68&sn_^XA?l};k45G$fL|k17?Ri#iK)W*7 zAfqzQ9&oA`ta)IE)!;8xO%F_Y&I(UGgpkNSf_78S#TUG+^rbwx!}u-X^lL?EHh0zx zw-UUIM%W)w1avc)G0Pk{O>}6bnB`5N zn!C}F^OMpE_@zgR7p3}c+9lV z!9Kff*w{U_bdX9(5#Iw+xqs8KB&=JywC5J?vwRRX?g~m%1>7d1@Fanjw%s_FonMr` zrZK69#I{l@j3Yl2?RVVz8%gIyJWYa>O#}N}ib6L9{U~rB3D*IDO)I;ZC{a~Z^8Y{a}_qp5rE+O7;SLEAvQB!Fc_3+_r z3HeJlAog}TLRVql6#8+_N3-_$UVe5$$_lBi7vmM9tpur%g6W^{$}4G0Og|CA?+2N; zWc?OPqla=FXN+fb`KvDX*k;mb;P+PK-V%#?3^9&-X;F?2r(xOv*oUktl0@SFxp3|} zhS$eBNf{YTuLaYN#Kt4^20}FP;M5R*)&VW~s2bF~3KpoXv`Y8MwxdH6owenHB2J{#O14x_=7{|-%Fi!X!Ykwoh%&@Lo 
z+t`3QBq}YA?~&v#i24ilY>AKwDv_j&OKv#@kWrZ`Kl>OP%%t~I zs`C?efC3iaPKmVCf9F$T#wI$niT~pe6UO5<#}i-FR!fo1YKrx4?_ktu+y%A!QMAC` ze#>)W^3Qgc%c+HKGMVZ)_Uc8Bdy%D8W!)so2~j)4Fs*4&x|jB}lZbE{NuzR?OpJ4} z$hzzX6pMb&_QxrTh^`x(Sw?YNphH#5``W`{dyQbMJ|_+UpKC|0JWmcUxmO&{{Vr+&)?%7_W>one>7&-OEpwf<`F4b+3p&w7u zzagF-q*NkB~y~c0`Xdbc6`oq2P-TgbehL z1jSlN)Ct=+`u-$P`DLZEcW>cvMvKvF{kSre4LN$d@6zI zO7a*BflU9;4JR6cceR)y!%M%Y(TadpmM?ZS&1Okh%^$c@%H*sYc~(O|6YlW!=z!N- zXOfrLl?;4{!gJ38uXRpnEHU5O!a}5h9w!mAqZJB@EeM0az^*@PndcwM(roY{Z=9E$ zhfg}Q@M;$yG~@yq-FCiiwOzbqjr>oLc3nWZO96eXUpPw>MerpWe_@M;x%Y$afh5aiO5XBv*j3_L!BNiPg=V=1@)SKi%>I zcu%NE-s%H-K#`u1kO_|ijPp$s_oEbhZ4xAP795;Hk1C_hS@6(%uwzTK(zzsm(oXcD z_x5Ev!1{kejVRq(Q4c~%Ro7wpQ^Q=0LgQlQimfmbyg(H5GHzM0vpha%-HG!OMF`)r zbFNq>D1w4Gyf`Hm1KG$fOoA>70_VnP(FXr{XX$4>E4RSz?z9 zpeeF*LRs|^_HZ;RUhMJkdn}$KO>SKFq5?$1k-DxjK~l=x>ZRuyjd9%%1Q#9)_a(C~ z=DBjFv7_&UjkxR>CUboR^_Tv6aunJl0g@#@UHYEpbDmYH9CCi~%FL|xoIlt&P_+TA zd^$MBM8AeRN%!wgz4z}Tlb82}<10pgKdGyR-r9(t-^yOzZXD&=>fYzaR?r-KXste) zT4qp36>;In=yDKz!H7{(1~Sx8D~c%MMP(_dsA^nL*LeRlF^ z!+6_pEq;i{B>m%pZ%8+oO=$w&U8MQ#nUqEM6QR@E<$`@$P4oU5HM3mrAt5H1uCbz5 z_xqt;%+aF2tiYOp+uh=*CZPGS1*+^}M&<#);~H2+#PUuCe(3Ny+c5}`KY4~iqAu{z zOu|Vy#VgG{_O=67L_Lp)K*(*Cn%oGI81N?j%|nKo199#T1PdP8zFNqrB{y{(WyvM1 zvx7EdoZNvXE_b!vAx4-*Mj+>rbeB4+ti^`xzQtZT=WG!}^MxY%R6rr)(cBrawttK| zvl~wR%Hq{Kx)+N_a@M5Ja}S+r7tOpi;EjpBsMw!Zip@Rnr$UNeNzMU^&U~ejOEI@- zp(Re=#HmHNu;nGnxaF5bk~1x%&p&YpaOIXbiL|`!q#_#3YesksIcg@WVHqfoao^{i zVow|x3m%gU0qc|Re63+!RE1M#U$iWp!ftqHB_+DuyC}P*BG}T-YiBkwj(2241!A<{_=&nQUFDyd(Mve+jA zef6HrI>C>RzJ(m-$KCdAv78V_hmfoXC9!ok8?c8(X(y6kYd2^!qp=CEiB8}zbN?gs z=+D-N5#eP8iPQ%PnZTT2@fY<`=A)kk< z3*F&J$33}{SIn&%8%2kCytn_$jEF{Q-x@L)L6V8s&-Ap8Mm1W}vQKrVORnv=dq&0k z{juc2_SyanDs4YtMUd4UI^76$49x&Ym9%1tXOLY_OErSyQ^afqSwpK4ZZ5roD|pz)SQHm+FD9%C{1~M zpnbbl;JbzEniy^#yep>sA+cj-B38OxqZ_+6su4OXJ^qO+%|@BUE!$%(xabAK?7R6i z(cs|6EA2@ZM6}n!kqKH>rnIlN8^_(0LHZ!U}+qQxScef^YxM6WK}Wbc$w8&kFiS^OZ2(AW3WP`?N@9L7m4J+l03*pMLBTr|s=<+KWgW 
zpE8U}@5qW_p#TZ5o}G9196NYq<9R;wZbu1lhW8*vVy?cjb1qrSnOmePp-YdSt2ToL zQ>(|W^B3k_vWfO0L$OA;S@}EPgwF!k%>t@!*yH2k$#t9 zyH=gImzsz81DozoWb+?pJd-p}vemdE3;K=JZGpM4w01KY2j_-G#Yl-3=D7<;MYPOi zpcfuSR*OezqLTz(&r`*y`58m4=O zkqUki?^cB#tQVUC_;PRYuK|v%V)eL@l3|?TuVbHDlpk+%D~i=c)ck3<6v~=whozm9 zcC2Pl(l)0nnFF58;2IU-+#CIG?F39Hvd=h6^=E_W>$hQ4B;ywvFYU@?Jx9Z;A4!_P zj^UjkpmiDHb;r`r>OY{@qg(xHRwk&o`T^dR&SdbHX0nCOgJ)O8s`k473_0G((F|h* z*x1~p?9ki*0MNjUfFMG7L3-k7r%EJC(Mo5&k0I#hCIPR<8NCv(a6|N3_9dsjr)o4q z_#N8BHmCaEA!R?@jeS@{0(}0EM@O+G?`GMh0zn9AeE5A%d_ON8A*{RBk2T0_gtqQK zF}_b@ffY8|04otGV}!*r-6QR@pT}eqt5Tm0#~j?z5`GA}I}Z!#Y6rgC?5+YRSgZ}j z+d^LL0m#0ZLGnvl)=a?8CwNZ1zD9dE*Y*i9cD49pJ;Kk2#cKu79G9M0^y}iQvy1e# zA-9j<#+2qH;PDsE{DFJ4a}0G$pQF^*FreagQ6rTS$>)_#2 zoredMWSa+pej-HL7VVcFMmv>x$gOr>T!}xZ+VLGK0(!5)V2mqJ0V8b& zcyj0ge8-l`Ue(RCeMg?C(l&Fq1A1zu$`=i}uXzv$D7OrmUt;FupmwC&j-bT5BXkG+ zS}R$+=M1Zp*M!q$0{Zk&q?C*v+iBptr##XcWm!8&z~e>K}_43dSYP zhxl)^V>ooukUT+#-siDBB|ZTf2$%=$l$qWVR7XI?yNK*HBN8Z=8mcllA=K41wHh(X zCc443ZXJwRDD(G$I~UQ8V^QcGt~kPu4|+z4^^TmnbD92phOI5hh*aa~!BCJPU}1Rn zgA88Uvha|D!jMpnd0vbNnlDpOr)15KVCAb+gw&ALm*783hJf>SZM*z0 zI-ze0pAn_nCjPHnG}dck_V$fb^!JfQiNbu1RX?eJG4297AM`GDoOT+z`w-FNMy0I+bjDf|KdvjkQhv9+C#oW&a0XO9F+E80uLDu$`F zKZWRKa3+p?O@GI;z;~5^g1&^pZDRWOm|-M`Yzm-!pg`i_{X1PL_x#(k5|M~Q4-;S8 zV*tLUTCS%kd1odM`jFbBi5PL1Q96j0tUfQrmV@`6g9u4PosDt(sg*R)MLE55Xp}SnU zZJwoFdQ^r_SgoAI7u450)r^CObQUNngDOtgP4(`gwJ2@C2Vn?1PzbSIA#N9b)B?8j zbQZ2-W{G?Mr%9EG1^+DB0*e;)L7(H25D6lH?H1EaCmNbA@9vu9L|Yg%iP^-KUv{95 z5e<`A37zO+aw(|Gj-y%PL7%#>TCMQaWUyv#E(r$_uez{i?s*vh7{Op|r32enH$Vtpr06QY%rcc%6i9`?7P!V(7XZE(^_;K zWF-eI7V^r=&rw4(sP#&+pYpUPq z*_GQxVWbCAmHYMA^LpH3{Nx{ZuMM)TRq7X+fC{whmv&M2ST--n3y_?Lt62=M-kDhF zMMe|4tG2bIL|bPP#`hw~mtaWuF>i?zSiWwUE??_{)v%BR*sBJB_RXubp2$D zGx)ne4Nerz_$ceR%%pTRYrE33)~VXaPXZ^}WL6R&+2vKlcM3{Mtf#70jBKF0J#T_6 z^wO|0Ds#(TaXqm{tKJT&BL3>1xM>wcIpVb5hovS_laCEV+u5~b3kivuelLipk_eaXRH7BI!sN9*b$H#A_r1_E z=~EsQT`A5-GFm(}HokC+L3H#L^7r=@B}I46@nP}O_|4o4*rHF6Xe;NRM<&)-4~=ixof4^--|pG+()wT;jsI 
zzUSuYUV*iwFyH^&*)J^yo_#O-Ml6@VlhnKJfkFR0_6dBIdmIWfU#`Am9g(eS0^@{rVAIH%z=4)-F@t1zJ19XY&*#Y1S?pMkVJ~ejw+6CB}`P>QEnB6z`WVRuDvSZV<8KgQyUbq@kjsCszTm}>VcAN&$n~7`L#pbNe2bH zMmszN60;-?^{N3$2wm$K`bkM!uA&z$ALXJawu`HGNttFM7m)STP}^#^&pS0ClVU&@uLh&Ok1QFmSwDTPEqQk;R z)78UR%<$s5Z+%y#_=x*={2)AdpQ04lVUm=W*y~O%=c#3|3R#P2)l_?cBC) zoy-v6fO#DZ_diosKAnEXX0Lb$@h*TAGLXEs2d+r`-YTUBgAxJ-$#Y+2JwnoZiyVbm zLu&*?u_ex^E5zXqZvC+EUZF7bVTjKHx0qPCiv${miOobAJi-HOUgZeEy$#T^zD&&N z!4rtdhAyOTR6~NFctiEiYC+YfeB_|ugAj!hWz#(PdR?SJIDdCaa?Usyj!g#|ID~`H z4)d<~m-48h{Z-rd&Swg=f0Qu*19d3%OcN`Qe&%SPfxy)#R06t3oFts+5fOG#=tKvn z`Z>4GlGYHjnA~tpxAR=l__^q{;|(BKB4(Mc+8+I@G*!yl zGAc0l)v8tFT>Cm8_(Kbbqx>|%uULFZ9C?(+i}HQr|2KvyMe zr7-a+ea#69zle5sxkDVCH-1q|)X*yg{s5R0+cOS%1X)d!EohQcS9(BdC$ogr{g{kr zz4PSjMZ}k=%iQAo3SmJ-0KSKQL@SYJ`BgNyUko+l-eHA)CUlA08)Zu6kR^qTtkD|l zOhhKCW|vVs;j$71rqhWc{MiVe*10v#pstYwp>KS1v-vNatSFSW|6i+{)i1*!-tGE$ zeKTk-aQpYV<(nH;M}%8dFdDP1x2gW5Pvczp1orygkDK%yt#aym3>* zV^NS7qv1iGz&BY$vM_XWm8#m7vTLx zI(Dr+k;Z6xP`et(ObMo~)`Nj}Tj_rhf4SWG1?-J*a-I&QDr*>}zvB->JMI~ov(LHd zSt382IUl(F`rC+@Ytq!4npOtinuy{}jZsyy{@NBWudW%g3$S!B!-}eK!~3&7Zu@A2 z=D&@j4QNZRl}j?{F3P?eO+Wpi#a+m^rw>M)%Iu#I8e4Dk6kNuv#^FsKduq+?c=OBX z1(Nt_wW*2?CgO@(G-A1$Zyj(xRP*y5oqpAY0EV@&_=>9%huk&@hOvaoaw5h@Mifb2 zfNb{|puLKYogCmsH@zLgy0>+eSiC*b9&i}q{YUdIi`H3HfeW@#O*7%d0eWu*Q3hzL zQv(MPt7xk_{+I1TTO6mN88?HfSEL!{@6z9Pa`b!{F&+rt-S-#Mp{|OjH)zJK6CXO+ z*a@7%n{X7-rP+>A@Rn?YscXA37_V4#VcN=s-a&zc6Brrk*I+&ft?+9TDMX5#sAK4m z>N2VNXlRpUi6RbOzo!gTg*CFtI#-VTGv*y158WSaeX>>sci$z%KPwLY7UjL|i^30~ zjul{6lsXRLP+Z>M#GAb_xp4y_`elicydihF8Fn^eC!O^< z*L>&ylB6}Rho+Gbx!NOl4HWSvyuW;@>Nh3Pi=g&QnjtX~(XGOsP)cCF_4lAmq`XWL zhxMp%whZ-%;FxE0uc6Lc6><;IbgWv5uoIzn_OL4_HCVsoD`DC`P6b-y8EVHf?R%`} zAMf&U!cji{U>_BtU_MpA9cz}={;8=I+>>|O-b7n+ptFH<`-0efrT6r$pXT!iN~+X)0o5W~K-j&(5o0uRm!#G%cg(U7`u@NV zo-<8&SBuKsU$Qx{)Kx#|R5l_Z4Sdda4W%mS{iVE$EY5K$d+>)yu2uM6q_;ti9;)I} z>0dCf3$R^_OKuuZ=#;}S?9;%p!OwCeozg38uR`xj&SxaZw}I#V)t=`fvxQeFv%4O0 zDuHMe7U0rf0M`>uyL|7^C-Yc>%z`_%k!omuJ&A7L_kWrU_C9-=ij6+XmEgb*@em>g 
z(BR868$LeN_mxSFYTkjotDX*wXHYfQ4*Klg$VUjL!L(Flv#|UdA0Qbo4#8c{6uYP8 zkvr=l3Iv2=4uIlkD|N{pb-E!ES_2$EY+TCIKlyST$p?!9TKjce_NUsE6KOoSg&$A# zD!p-yra~f&9}bR<9I=YUvyN9cEjh`g_Rw&9^8Fk!7}6Kv(N*)5Dsks{eOW7W|9WL< zQFwa~FEj12nT^^AwaK3(QpwK&(jho7aeAhxqzXN~pLuK4M5QG}m z3I3s=yUbj}lsyh=oA)rWMI`v#kPO@o6OZm@L)<0#v1JJkfUGKu1|xmt8H&bt*n*(3 zKW>FC=9XBY+AR7_(#YHHj%wkSYweNNEN0}UJnf{G{V`?oy4+ zy9?(r>Et|e*T z9_S`X`}TezsUpNpvT%3gA8gfM3$qH<_sad(_a+UD5!-ldu%x})m0*%90ZR>=;?9dD*)v&;x4;KEuIv{q*(#lhYURPM|casCw!##yj z1D}m+EN2*Au!2sDk|M*Wo+HbNrxwXCd#Tay<4hV}c7xUkoJWM6G+6{mg5R#n$(9?w zMA6I0iToIs69wtSQ61zaV}qsNas`(Fsby4o*l^x?-Yb7+)2Bxz=6UWK91jcvTSbu8*35i0 zEGOGPA)U<6UCQ^>AXCh)qJ79PQE4;U1kn?NO6&;xhLpv6wI`F;(L+B5&+-e5cWtkk zWN{P%f=|B40%9yX;*`=SxiVrf=UCmFr}GfH?;^S=2m)iyhrStKAay*k&OC>A#t)~s z@4qhP#!J}#2r*($JA|?HJ6T$@dO%{a>}Y9+aRYE#<;^n9iD}$fhxZE4e9pIWoFigLj-1u@cyWc9az2WXF1CR?3RWEz>cD%UDBWBt;1) za9p81lneYJW^BY|YW&0wRF2#~vwlY}-N*_DIX>yw!F>oSs5;`#?zk3~A!qMAfZvlqp6kpbYm9g-23-Z&u z=;jwP!818B90?0TY7vLUE)_45*@_4|UOG@7@el5|G-`nX`W78MI$QhR04PXtw7M9!*SXUXjbhse-?6D;5js}JBawO z-@~?9^ew3Xch<{e%QOpr3kwotlZP4dS|9;|5$8R`_-@3>d_9WS)K8OM(DV~6y5f*I z(Jh}LpA8RO;uvBOM?E{ypIN6N{N}>!0H|k)OrDDF$hS;;9EglxA(?faRg+u1?y)}0 zq6K#jfBB#NK)UXLI4P3!?S}bCyXU_AFvxmGAO%O^KSQGvn__YfV4p!vIcI@Ou6yX6 z=>?G79Q`VZc-I>S&LW>FDPas)yCS&8FkRn~5r7DN&Amdon<^45#_?=V+va4NDq-CF z34gNq^=P7W+VDACHRfw8YX#f}6%e?}lQ04(vlgAtaSz-g+D9H5@l&20SO$mw;tK<1 z=BiXsGZ>Y`TsKcIIOiu%{~xG86!Vtcv!wOgo^ABD+ELB*bp@gDpoWvnu2BV)L4HC% zO5hcmW-SHgLM+v>t~}z;Oqa(3oPzkf{wngrwj(amDN{ipXMFuB1-Oz(ESPum;RkBY z`J z^gpH4mKLt$_N;rlLm{$$065`vh}(mrKn=>4roX@M=YbQqVKzgxK%miB*dHtP&sp*c zVD!&ZiGXPEkCpGCzlld7nhcuU{S%s9gSdP3Na0+M%Le|9Cuz$)iVBsq2t%Un$cK%8 z0SgBz4QWdovu5cdMT4-Km1PbQjF}q+vw8O_GQ~t!hi#Y3LbmZijp3B1)KTs8Y)I`qz#Vs7VZMQ7Q{eB5>H39^}M*yTrBp2mM;-y(j z2YGy@J*{1hrjPb4qNa57v-M^TDAfZMBR6e2N950qSZFx1jjHKr+E zUY$1)n5r^uK%uT>S0Y)4_%}1jDK@<-=iR0O0GY^$=dt?pw0!eXM~{32Bmq^vqmMBG ziYmlj`KZ>PIvizvwn$Fzd|#?V{bD>?oiPS=#aUdB;KQu#E~vwtB4rdLoq8DY?c?DB 
zH)V6(q3B^V@}rA)+EWoEkYd!i)Tx|2qAMk(DOHgqx|!$RO*Km)+xztCAn9=r?y7Z* zv@!5i?~a=6Kmq7*7?d+1S(UM-FTS)-`Em`NBDHJg>eDq8q&XYnhv9QF2CSIqj zzioXPj>Y|VSR8Ln6t)gZaP(w>Mc5LDlhZ6dtnn`utWT(__WqMzDGnj~x^3y@y`MX{ z+vI^h*0+97pQZO!TdyqGY5kJ3lt`^^k*wk7`*kPmziw|kXgD9JWTPD;LVcS@ z?+%KU6=Br*Df5hv-DBT(uVSeml+vk>Cq0^od7S|>|F0a*Fo#c*3_(Y32603tr=ftSoG#WKX_Wp zqI?dQpd4v4NBKbl{Z^q`dlZaKfGRZNl|z==Bj9ggQR6G)v?Fht>`B!!&_VVAFH6c3 zMrc$&cl0|f5P2KQx~dI`d)uT{W7$q)fv@UO2^9pH(&rRTv-A*7Emi~D#E~X_ybX4g z+uHMgFs-imF0mQMdVq?%T;#LOZqtTgnh3QuQw^Z4012_4P*n}QflXD?!fp-L1sy?I;Uh2&AQ|ACHe`bKJHt4DsVz?DI6ANO5X)2H*|u(# z2f}$NQGprRdu|2s|Cy5yG(01dYP5@WiiINd{1e9I?FVd=_@f|S#_Jfsxl3h+8F&Zi zqt0_`)WYxTPaq&jSXz9pQnlH0XSo~M<(1%}L6r|JFKwH&SwG{Ya&bo3lFFVcWOT^$ z!-_7!+!59~0 zqhaj&FBM2zx=6+FqKD`su4C6=Q9+8z?2Ncx!1@B(tZ7=~2^tEMlBYm1g?u>}t0z+q zG0Aaf2c13z_COV0JZbMpZ&DAg(WM%`2sDEiXNtFOpy4^@FPp z0aQgxwK`0)68+knFsJ^sbi^V=?@2g~qwNV$<2Z>&+p|uagou*5i@1mIgvk>IW9TX0 zVK=?__c-i&F_7bIzppGsq$tw%?b#Z2!m0+Ay}1F>k~-F4nX?#Z!KJ_dRz-pc+HAMj z&F&y*yu$3%1`A==F>yzXnI5^2r1|%QR(L&AwgVH+;p(mlb^4Zne633pCef$06-rwa zyj4!xHh4D^*RjKH+AC(249f0zDLmg^r;nusm*uDRH5QWI#p*>|PA=OP#lIt^;k#vD zBx=D5AovzhyLz0~Y&LVbfGxOG^$l&h8 zoq8u^VgN6Rqyk|7Flb?nK#VH`ax6wmtuwH3!pK12b~!))h?|0?R4WJDD*LR ztkxuTZr)p7?pdoHo*F9#618$~EXc*jgfdO>uJTM^BB5^8Y%^s4xt78{RUUgxps+{E zzTwfq?_rfu4aZCWw&=Dux*?VEYspGb$xr~Po@y*Z)`ZpmZ(7#Z#85M?FJ6od`}Vt2 zD@9>elu?3Qvi-9f@iDTrMB1!f)jjQ#Vozd=^BF^Hs?}KKgGPWBsWe!os_n#gIts!k zzrg`x5M|z=%J44QXoMgTz_-S0kg&zN@V-@;#*VTWP%PIcTL4O#!^1Ao3_>doyaCp%jT=}gb2NqFoMufnh#7x(vCwwhnc@i5 z&^)Rhes_SZW$gHuqD)-5=4F392%IV2`y#E?Ld3sLAbCJA-d(jqzdvEPuWmxZSWWm< zxjbC|*#?EJlg8xLMx988v23pGaPDH}2ycQ0WWEj;7Q84(LJq-(^zFo33J72!KbPK4 zOhJI;rMk;5OB&a!M(J!9d)Fw?$#Tr<(kmg90=YE7dA$r8Y_&aAGS zQ;R4`8Y09*_ZV>l(!ogu;WYbt*FNYv-$MnOTjVmGO9EF!nJ&6kQ8?M`y47F7&jXb7 zGe+@7AeAEX;wrU5SCr??XiB+xBkp-cu#f{5r52rp5e2Qg^J4$Y*RQ=G$5zo4`Qx|sJ?p&92uL?f?%Ilut>)zWMcH<$a!w9F4E=6x;;+t9k&4!3OMj4C4lra6VMb7M5dFWV%z;s zsol|x!#@dTy0#R2O+n_|a|#Pg{v-;ahSR=&>Xm zu0qTlIJ=bqZd$YMp>CZQ1cT+AY7oz{H{3F}LiO+=p)0ESyn=e_8qScgW>nI{k`rye 
zV+Bd9hp;L+3ahbtohx?S=2T?di@!L!5@Il9x!2$5pz>_ST-xRshFE+>Ug<20C^svY zY%%Ib>=bYKx}d|M4zmi%*9Ks%pnB*5;r!_d`c4K_9a-qoomO|xqoOYAq_i&44UONi zjaO!Tu_r*~>U~hUB5(GO_%nyV;+e89mAhg)sl-sKY`U8+6tghh*%nr6X)=ase+WWymgjXFo7ZvU>f%Apa^P0e>vYxk!R)w1fw{TmvAaAK)sHL-{tB$IaryxfhFL!eAP+peI`*RO4FGx1;H|r`b zV&D=4#d#gS^_=c)kcQ~ro1PPw`q^yVaXo_6F=^QKL`}&BRe5C0YfNHUyp=xfVz}1t zbR~7^yh18<_C}MuRZcbPlgYAUUsR`BZY5T(zKPJr&h{YKZWPusxFRRIxBrNv|_!$@FmNnXJUbT(%goHIz8k2%4XS9*^)fsY{I;F#3{eL zIGE+jx=`wuv($h#2Z_(c;=NMGINKXp0juDBg&j(#7UL^Ajw72hnxGllorp3J`QU33 zYCJ?luyy(rx+12+whnj+&o4Ao$GLH)y~Rxjz=Er5`dgye=!n!qgJLhVU(Gfq@1B1) z4=d9-yg#%Sl`1mLA?%8($gG_G5i3CWeJeh0LQF=jL?J`l(Fum8x-ge4|?THx~Gd%-8om~=jGE*(~R+Gqw6?$~CB@HJ+pn{%TDV|Ae+$(xj^D(RIubb(+(U{l5BrXC=~cm#u}h)laEd;L2SwnkzF)PGKql;CK}O^PN2NMwa=QY!=~nNo-UKfl zQ&c)>d$Upz9plsTGQ?Yf7)4mpwEwb%7Udv(Prt`_i1i_9^GHyxg(u+*c|xeVW^lXa z4|dpJ!Y=C$q)}$?Bi0c2#U*5M90ePyLepbLrmW=p=1CjJo`kzWXzXZ{QiiQDK*1>g zCk*V3Ni9NsS|Iu7Bw9eDwD;2WeowWPk=*I+QBEfn{zUttKW0qnSdt!x<2H z&NWwo0G~_E37khUG|Qpksm_>g$Og z|4KNnSVK7?!T{#4CQ-DYDtpWSy8Li4w2F8uM~VaHeM{8RRF|32&r};m*&^p7JR()$ z5oD+vTRPz5x{fXPk^3G_q+wD|bh?ln1 zhLz6cq#Io&ZYiKo9@q8|Smkk&U$gf0=WtD}x}`^Tia`t+PfYWt9fcrrM6?CnKc4;Q zOUY`90-2hKpyT#>-z3+n?M05C4FRpSSi4$2lu3FEu#{h6%+xJIDrFt~U2ax;@Ve1H zkYq@=lL2szX`jMtCyI3t2hEtKXb=aKsNA#|t#Bs%1-i8U93cwUc((1)SL9+iZ-p$B zLmzKjN%)$F1x!e;3XAcbjHD#CNnn=xfC+|Z==FluC>bo~k0H_1@pcL2n>TcdJHg2>*LC*|{QCC-|3L@=a z(wQDwy8OwWfeQ5lzzv(#><1vinmlpY`Ygw)YyR%Bd>^!#I6fC@N_x)SP$K0&2Dq`d zY*GY>wL_2Z_QP}U*<1-o$l7VAwYFxTVXUi`{bFsy(z?1qJ=EW_hxIBwS}2>9*_83~ zjwnQ`+zhKdE1ChO^k+{x{{6~P*%n)%nur-5(m%D=%7C$`bZ}M2u2H;Y}nVG6{Qx=5YB9j z6&m&_EZ|xh$FWBi;DMm|aM5*0Y~e@IQ>O_q9>pcGQ5TcxW3W34I4Y~3F3oYvMX;U=n3betR}KrXQ;%_rK9Y#?4zN61Y#@! 
z=3Sl~O>X`V08>D$zh<2<7t}!fIGrPfbR|Kg@YST{X7-&HPgdU(Eo?-!zKKGBg0w(dmok1Xqsl9#V!!@y3gYqACoj=aATAm3kbD-YrVRdo_7rHbvyNs(X?WQ zq-(&3QWlGx#wf1>ODX5y7vmqx#N~)7bXyTO=+Z5d%B_+|#^h*?itkJ%+kl5&vJi41 zG!H++$rQN+q}?dgu2K;?#2iyj+KuoVzr{v9uJdYqh(39`_RoFfW4NSl+9x;^gj|+z zGZlhCxSB|CT(TB~eQpGXPDCAfq#=X$Hx#qv3D#VP7jt+QU}^Y?J{8}3{!EOHvV3P{ z&=d;CZ+5Ow(@!ad;;*`mm7Gh36ms1`$LDv|L9O%=Ovp`|OM-A1ZfrHr=@pEXs|W;T z@|HjLi}Coy!js~c5>4?m7X^5597JNOhaNR=!ecX%x2%&8nR1hJqu_q7L9EV{tRNaU z75H}2&nw|Oh51x%>LlcfXuS;HvIpBzLVl1EAr*%6d>mXqI;~n(zt&T*to}@hgC+GC zB6}WzijIx~xllplsyx=%Gix(s*_Coo0=Iv)HrECHkJ2d>^9Pf?Iu|8@<#Fp_A~X?1 zgxQ;vF3kaj0~&qq#R7kl7M+DJnm`z14@yVW;D90AM6M`> zYs9FYG@nyCBTh0})}DJfpxnPelmYnrHwnG}pq#)M4g{sUf4eI@;VzzUrr@4lR5G1p>$_?^+BR*K-8Qs~ zwsV!hy_UbgdS4UhN-J>s=-rO^Hy$)y9Zfl_vHjDR5Ttc122l~Fy!xS8^ZX&aJsuU1d?)(hUx94jVpEAfiGau> z<6wu#=o`pp2v&nUa$!)Qu-s(2fn(5!e!Nx94WX#tl*9tNq*6~+b85p}cxaJrjPq>M zkdPwg(}rXklii+(`7d%PQycBigvL7mr>q)LZ}Y&XN@d$dfE*=l=FC>fHjmHTn$O0p zWbc?sncMfcu7+`HlI!A|I5q`NfKay&m3i;Z6NPef z>HvYMo9eOTuM3vM2DWsRg`J+4u*<$SH2dCibjB%YbB7QszF@^caM}9PyPr;z%qBzR z@dhzJaJZ}-0KWdQP(E5ebtnMFC_apoXM`(Q*isAc1fDuHEko5mo;a6!>Dq0>tFpCk z%jQbB^Y!ONt)%pVCbH_QM^$+#j1TtkmL$CLL+$o$;2%#{*jzB%`nwLkmTY$V25;(I z_bQ#fp$WTQW_hVDIyPSqO z-tZb#htUT}-S6%(bpObLdA6OU|AfnNu8j>gNUU-TzS8a-{eG3EX!*X(`J6j-5axhqc=mJzyK`=&Qb?@d$$E}@ zlV;YO9>U}5S*^??4H2FH^~o@{T8*t&F>KZ@ZIi2=M{d8N(?cWS&$JIgTyGfWyPDF~ z{3yLLgS9jAbdgFm!|)VTC_Yl?L9rByM2&_j?MtKb#A{A8)8C}Vtc5y=V-qWh=HP6A zEtm7Z?f`tu~*Ra;3IW|d!M zslPnkBtMXyRSRUfgr?PS-zD00q2?(irP zySr&(0-xr%;?6!i6BYQnsTtr8SUmb#JgO&DHfM#=9V)~5j{2^2%F6`%VGT*>Fb<4D z-{9T%aROAb11;j`*Pm#Noh7qyotbuIb38iRzhEYj=_(qiSODRHWC6*kVNU^ATdX6u z?6YCobWsL=Jb>R50)bA#ZZ+F}4^_Gqley|Ek;IKcMV5BxtOVie@9tLe$TouzQq&p< zUeKZ$zP4Gz0D2h)M0^O(!d|fsM+R(Z2LG#y8(NDr058w{5jKmM6f4pht`18aLt00A zrO68ac07}aN)`wzSy|$dzU>Mw#;QsOcFA+eVWOy8EgsxD!5^#QVwD|wJ7BttO6`D1 zSPwq=T(Pl!_-rLqFS`a%_#eB^nGQ0)kVq2~P&__XCg0U^AW|(Hp{~U#8#r$1I{4|SBSmYgOz0MRjHphKYs@|kl_J|*irI6y@ zIg~%`YKl@h@E*E^r)VumzbP!vR>mqvWwg&27QO|T! 
z^h&e#ahq4)tO&S3mwqg=8bnm55RsuMe(3OOh}OT9rF#H;`ki3%DECUdw;zfEikh-S zrPH}n;^a9EHO0y*l(%IgjXLU?UUa6`A=Pi`uFs;o>~1>5aHbI5Bk3|y8J^|H=+=xxX>S3;5b zt5>4aY8BuzYq4c^umN?UnU%l1hl9#gYa5G)SO3a-vQ?K@#~4TZisHY`Q(-Lk0%Ux3 zN#6f7mG;1F3TU_@p{ddC_z!8eFr4LsX-w@iG>`MzKvII~mw+^V)`~!HxyCww*z|lL zT`4;4#T!#*{|fnJ{d2K!`X~`Cw+%Kp)h7RmpXWD1lJMe>-GTgH*c%pCk=0>W?dw{^ z*luarPb-}VQNp~9wATJN^ayEM7Q~-MOL095qR_kv)msol}eJ&WX6$Q zc5uBvo|8|9$JdXcHf#8K*lq7y{*^~IqgTw`3~RhAUJUHh%%Igle2~=lJkv|&(HYE4 z`AB8yW4J!llvoLG0I0oSA+8h!P>@N;o}<|!PSa6`2bsu#{&u1*%C|_hRMio?32jN< z8%k?2jawo-$@)1=BH5_`!BzQGa3)^Y_>t8r6RetvVBbjs(Cj4Op$6$5VM$_^qLhBX zLjVlfx;`bj=m7T6z7{G4!J~QX*(E3^vRG8d4w<-hpk~BtB*xMLL_ljD-~hnm=B7Y8 zzuhd*Uqnsprpc1JX)?EGzlw$FF%-xx_?B@9os3gKFd%?&>muqfTEcEuzH*54w%zdS zB><=RHCx9ifzk}uX7pR!P)n|Jo=7qkO>|$S_ElfZ@rAYzWUA0#3xjT7Oa8PRPp;L5 zImIPUqb;Urxv>|!9-v~3?25Y;eisXw*a#!5s9qErKOa)a+J=ujlN6I`l~V#Q6uAZh z$}oYOA+s3TWBBUO(jS*nM8hf0H>r`OgQDXv?sUdLfh;_E1#w=Qg9EaQe6yxu8c+od z1UHA_c>&c`Fa$w-nKs7as>9Fywv!Vr$jL|qjQ!2bZflJ*)Px$^mH|G1_mM27aLf&X ze{RUY`}^z(4i>Jk!LBaaQPVmJR)v?2D|Zm{HL!I;XL-NsJ9>!p`i!NNHPo%O?4dRb z>tk;W!~eMdJ-EPUhr!UCLzoYtMMV{x_dbKq@xQbmGP>)-KM!GdDbk+ z68%0kS~WtR%&fx5e&wUC4(n44KV{G7@jPtFi}C-qu>QLuNQTeTZ_5RTaLD!1s*LJ* zs*C481SsX`XYNlsCj1h)*HVwqukM61C%ozDCng@zM{!q8vOjwEP+F{ynzR)iPIh2t z%V_YYWDPQs{-e5dE0?J@uC^E9SgPu1cqlKSG9!YYasR4wC=6l=I35hTHHS3^pnZV@?R9NSc|$1OZ3_WchTeZj|D0Al^nvWxPN)v z-AEm=Z7FDazX{g+n06a~CZWSxW<1fHcFV)QN5)ZsdkIc}i#*L*Ebc@O{28i^beP@wsoH7Jp?&}1jEOW5m<+sMk8g; z{nszd$YxJ;G-_l*rH#atrBnX{)$H*%QWNSafzAKD!^p0<;mbuXYp_I(4ybkbFyLdU z!|5mx@<7SlIk2ZFxWmaNju{#mIR0i_So(MUlgUWO1p_RQnjj%#m%XN#dAaxgGVKq% zPq;#ohRT609JlSGtKiPN9&k})YIJvul`%&yQ8k)|qY%v(6tRljn zy*2>H-Ws2ZNO>$=q1YCrvhv;g_jC5L{zM-j>KAeB7`pC9da-OsR#`xBcT2*I$wWR1 zEqnwL8r!|=A0$7Dk56It*|FU5eM}5(9()b<41r?drluoig^HeJ-Wd{V)lT&x*ksb@ zILVDYl?TW@t()esXaK@8dc)pRVs1H2To?qBRtLcRm|O;|Zeg28kGI!VUeW${Au3QQ z>Kqc(6-LDaJdxU1_Yw9SjZC1SPQZZ#>Qa0IVP)~XVB}0xGzBkpvW1J8!i0l2w<;Tt zJ*^_;$$Jx1S}4Lk0zd*qV3D7`g5X_t)qE-v*r=i%YxA3utE19hKA~o6DR-jzy8bv7 
zIp|?H!oMH2_4%%&X2w`YKa(K*h!+GVRZ<>H2<_KuJlDowF+oVIX-&`sv#be7tuTap zNbRhLt?v*6LYt!QmitZAB(&c;cxw+j^|vBC-=MA^F)lk!^Na{G!c`vDTiUVbF28B` zcoJ!r@bj?#z(q-dv_iB&XEWnkb7QA(sao$a5_EGzTkFMCTz};%*2gZEO;8X!IC{~2 ztxmO1VurFX4d&QV_teeomN)LWeIfG7Q#8!UX7)4j?sSc>Bzdx`^d27;#e8p~r75w3 zh{?g}Vd;7kwFw@;9YW^Q(Xu4hjHBbj8^?yD1TBs;9%MOJ5sKrV7oPN~kthfVyNp%K zQ{Wo5zqmQ0cc8R8?HpBe1V(sE07YNa@*iu{L^HpVNsDSKYRShlKQoY-KS^l8+u>+S zm=HOY$9?nF;=l;ELN~plxvVC6C#Ld$p3dobeU1Z6&ILe2kN#s%EBu-pF;@_)9+tw|O)X9w*#)E~o(g-5 z?z6Y%jzLH3!eT3{$w!^O<;V8&0|mg8j?v-UHbj7*h4lj)!D=N6wezk4O=gQXe z?B0zeIPM_s+MUzEbpyTnmi)8Aq=TI7gC_BA*a1+jmg>Vc(7u;0-oTjB=R?n;SZI_C z`9sjR*0QGx(8*={2OEbg(e=Ln6pNqK&qY~HEysA;S0dAyMvjM=Ri>@J3nnTzA0e9N zn4j+B4mXG1>7|T@9V&?pVsvcNl6^ z9<1@@cL%2tK2!WfXTIx!L*At_o~|f!^w!v>n6=Nw;=L}%R*eWJnpobrifVv@p&7sO zX^dh!cUT;u9(>M0{UIISj#9AIXQH>H)(lCx^EZ(#nBdxnWpO?Yr$+%ZeLwD@vo-280jv_OY4%UhH`~?qUz!s?v5`sDZt_X;^=E{T;@Ji%yC+vlgh(~t6@C~7ZM3CTU z1{UwP{9q;Y2KZbh9}IXZ?6u8;A3nq?@~L1syf zsL9*`iYC^^bTyglkI(F%fteu#Wv#1aHt!q8R`wSlFO%BOkPZ{So0LUeXQMmYiWfzF zPA3u)3Tncm=l2ovh>Lx>ED=FNrEPyYy?s5yGXIOr+X#?Xm z(8+=yWre}KTosh`qQy!p$}Zm3a}4AM2{jEC|A8cAi~d1&^a9K7e;t#ee%6cd>dXpY*k_fif zFvbLcxHsk-4lmn_SQfb2pa)$v8PjeX59=UM@ayjslz?;i&b{|h?u(-gZOpmj;Wu#^1e;jC-ph z>#595o$0~FYAq(i70}qsz-neZVp|#!E8xABd4P}xY1h=OGWt6<6yP!3sjm1StpiSec2${{hp@b|M;Nmj@>o#2RZjAk!4yj}HhYKkqXo)wY|#DHzx{Tu z4Ztjk42qlbfkw91vbJQe1vk~3HsU@F3d9y*h5a(fb>`+uM%1MxaDu=e*5G|8hk+_W zQbbFzb?iGlx>cARKImOwB6~+P9{+5{gAOT$IZy|sA@oc680bh~1)U`)=6YJNY^k&D zR6I?Ti_XuFzewM+pP2PPRuKlt?wuV1h77;pfX*@2S7r^Te8yzORbXCz3jAvZQ3GYz0OebaWiU#{wbNw{sXwpnGI^S{ARQKS^gw-a>Hs+ z(3nX_(nQ?gGDylfu^poGX~TeKV*GTq;-er${KdUHo;_qLL#L!s<)i%MJ<{Z$@J3B= zxvyWrFu$DMOKUus;pRd*Q{TfR(%1c_+0$4;r~9IwAPxWtCzD9^-d+$Zg0M<6P@}Q= z&kjnLvOZ!ky-+1yM%!5~`QK3R=&Uf#Xg5LhDdkQ%KEg!aj_sL?ZsB{~(j)Y{8&%*caEj>LkLsY9|HmriDk30?ebsDs zfQ&i8c!2qwjkgJ})Z@(-WUoIb!VWs`!%r?@NZ4mQgInltJ)gqP(E`EW<{92wo{m54 z=E2kPf<^o%pOpril?>43$!e4EG+3qWi$_lYoSl#O2fo!~U9^;$DiU)b-JBX2a|#Kr z9TsM}zi4wyFKKBG?RRzRzvTQULlUdFM9l#V+Dp#B5S3qyLdL4m`M}`VP}NhbP{8(k 
zfP$`Jcvn#&kKLHh(HPH29Ebs|K7rom(Wj?K-^HJn%j55E&8p~ziR z2^lK<+M=%M0$x4qsRkKTWhaok!BS+~M0Ewnu(C&rJ8UZq|Bxy&QzC@rkYmylg8X?w zQW|v#BN6xaF%@cjcPzDa4U22lujqxUl{mcXdYI175$yte)^MIok*ioU5mC*0XHH00 z9aZY&^>yf*CpVfbH~pDsRhI1k4t4I!C<*ICg% zIjCmAR>6(Wdb8HIE>^sV3{k1(&_d$4O(QT77x}$U>fAkOzr8Y{S_y8ef$xo$52T2J zE`kE%ckRkvtK^@*zUY8CWv^ty6V9K;y5tq(WG+M8GJM(>-?Aks;f%=XA*y6=7l;hx zLt{3`4Ns)JxHlSTidP9`aGjq{)G=2+y|&bv)%6N_AxFSS=l8a&UihsleT$VK*u@KsQ9HaSqu5*R@GPTPPDm_Cz3Bq!MpP*j zi<1?zRQ5c-p8xqG(9A&;rC4+&y^O~uf?{Kbo!bIa%u$yNLTx=hL;yT^2#3ND z7nY?XUpF&&{M<T~)=aD4uu z#LBtaca0GYR>s?W(4skoR_^$S+Jv~V4sz^o25wkFrrD>J+q^&ac-uZQ`5G)RpyYIe zO;7u{{04V0k8}*>Sup0$0XbSg|v3g_m7G}%;Iod##I|5`m+Q1drfn(n9b&||CF;Rsb^q$k%?l(m(}G8?BUzz7^khZ8$LIAg55I-14RhT z@!+gf<}7t+N5l7;S}IXA&=K?1u5o{A5oQi$&fp0`p&tRF^MzCQ+&rBJ5K;=floMfI z0f1Y-TcdNBAES8#q=x=$@mW8qC5z$>@A|~yiK^mrx-uK26s}!A>uc>LR0~#K8tBA5 zn#a-J!pKlqV5q~Y9`J!maEnt7iJ9z)7}?s{H28l8G6*uPvgXUtW|{wF1y4SFjBR(Lutrdnu~(-lV;27Y8A{{ zsDp(g6H@0-et*ICo)vNN#QMy<@ zTle(!#X?py$smy3ZA)Y?)0})yn45eYdXXOOGjpn^u$#rqqA*@Shylz=&v9v&R(V!PNHQRfKB1*YmIFMs;Gi>bE?BQd~viA z<@4$cf($sR!#GaX0r(X!H^Rz|q*aEvjkAN78WQk^wAJ-FDwl<2W8U*fjxsPZ?W6ch z_*aRo%l>o|(Df>0>7T~T-~L{Tory??NxsE~5^YB76TlbT`wia5CGQ(r=14w?1bJ6k zi$Hu=yi;X_c;zZ{@>0JgTJYQALMnx&Jwg@!)X`KMsQy2NJ1#lF6c0vmpkoSig)-ly z)iE&+vDU3s;hAM!C|N&RQ8Za^t;AIi-y}|Ywdklk_CoU=;0y=ip{q|np(`FXYn=w= z&lTZbvQWx3cPlS>6-=t>3<&;Kcb;OQ?kKoXM+FM8b+9qM=YtXiFt&F#*U%esj?pB> zFth@+I>OX4JAxOhzv|?u;q(E0j@@1Sr4Q+(<|GC{XhPEAgt%G-5!#9@{!HY-7()Tv z000V6K>`V)M#S%7a3L+~EALcDRNBjYve83mZ?TPURj4-=M>EZ>JJFD&$q5KI{R}V7 zjHHs@fG)~ATfI6-eLt%&s!d^T;1KwIW0pDBzOk)^T|?6G1*b)9y5=yv$(L%Xci#`R zOI#Mm1!hz&LwIb&6IgxVL^&0V=m%Fk;on=ZgWh3&P>TA#3m2Omep!^5h}IW@-p7Qo zu^O0L|AhT31`_n@^R(J}w;n4ebQ`u=&~OYqk7q=yoxzib7I@mMdf@rV>!8SA(R3>c zGTL1J6)2K6TLnybN-R%x1cY6Tx`C@wpu)~#MBcJZfLoN2TGCz8s`9Tc@m3e@kti8m z;8kAZ;jXLM$zn|_WyOL{F?}3nTmg3hY!jZ|dk71+eVL6}9et!#xi(%c@3dOn8>`mD z_J!!dTS$I_)K%PnO_yjHRQ=DYMw7{BwuxG`6cYvFXk5O-K`a0^l^}hZ$5H`h{##sL 
zsR?7=3$Wgjx2G*$w;V#oE`RaK6agU=gwBGR%SrAzpl>X8p@Qgna@i1s2OQg>QSfnzpc(M65==Y9+6lFLh!!z+?7XiG6I04jt!Ab*GpjhH zhn}?RutbkojQvrFlLdfEfJkujTd`->rDV1SpA!%FmZW-H2Io7dtkFXB%jiBXNPEx1hP5*aFP|?|t%75TtIz!OvH%VvBNWoE+qKj>|@+mv36x!WrP8(vg@@s|}+-o9Bq;qr$8bO*7ghG_GG?vspxF z?*9cInjtFMQr59d8DchxY|(dbh+1M#G29yYry`EBvaL@M{AvnG;Y9}!lGxD$M8|Y$Oy09z zpeU`*Q-gRMZ*to^VbWbZ;Kq^^Fd@DSGTM^a_U7D1k&cG!x8c4o9RdO?iaQO-P3yEF z^uY3u3&wK&{f0o@(QTZ_6I3>`ZX8Keg#eJ-aW8y#0{}VvM&67N}UBEbWu>2S8N7;iQ$mTKp6vQ8DICHvoIF-UZhIM zrSaoJw3e;*Myl3IVPm!9y26va<$bYLRWkK*L~@+;zTT0wg{k)*mTgtuHtf6OgW75A z5FTIDg(-x=-VHGfF|TL@8f5BnF}R_M$e@M5p6teYkfj|e3J@4&+GhgIH1fFdpD{Sq zKk^-j(J@7;;Dhi35K29%w21efg@Z-GILU;>pNUm$u%ZOVfesVvQzuN|17kNr$nKiqoo7BoA+z++`V4j zX9cBttu}zYSvs{b&_Ackn^dB9U9awx(?M`EEy+lF2p@JfgyW>uSh}t=WmS zRXt9adE6HKjEZ>Fx6^TZenAL6DLmrG4Wx{v_a@PwO^AV>fB`5pn3@wQh0ur1btS#(Lr{fQOq;nZaZ- zhtL$!hgfP;(0dOTRvkPx0NXs*ucZ&Nh0~z?*6-~Qh`ct2I)1)$idXGZ4YjMWZf6+h z{<`hak;0h9L)C+DMJ@<3Ty`tEu}&-yIY<)8dWF(_VPOl$+?^Itjoz5>X3(s1H6^P# zQa-wMk*>{ibwTx8@y?L0?$tT+3g>LKI&N>mp#oS>J>2*TNG4LgoyhU(F(qF&_eBQAbU-IRjZA?3{iA|!`rY->eksv!CHi>2e4BBge?-N z;87zYI*!J;sU8U&tc5O*bfLMbk}NX9-dD&X?Y3eE>4^XT7Q=OYhB^1o^LAyNAiYx~ z{z#!BPo{d5$e|`yQUhpPMO87T9jZ@rE9YoQA*(LKcnlNOBZu+qkxl zrtCpX=Vr#16I-c|dgvr&5=>-euwVDF9~7>Ow)D3uC>SMad2Vut^{xP(YwG z+IY+m@?jG<99I1>p3i(T`OyNHHM4|kT4?kPG{CAZw;L1StWG6(i@>^UZ!SMGUy>jS z;6j^LL&5$s-Hn2QxRwcLGb3B*>>^&lkz{Pp08L-FZfy(aaGH{;qMzwJa2k3vFZib` zpy6s<^ifiQ(#d2|ZP*(7zYh9L0D}BuzxXJo^%(;6N$qWzRbeAVkX%vi9b5olA-%XH z(F+E2(5$p5T4p`-Fhl7zOX|ZA$G?Y3le(%O5T{SERHU+aQqY{)mVx{`uQ_vlIvqRL zeRHDPR+gK(y;5Knf0*jtq{%BX$R`h>`^x>)`1cqTBp&MK$xKMa6TVzWB!AcxiaHuY z#Z-1rD&KwWS@~bph#0x4MMGKEGw?n{kEjoj-&W8wFLP?}SMNGgdiB`^&N0ymALgpx zOCtimG}ZU=e(BYu!tS$D3rO_UK22juzn0y6gFx3*!}6tHcsRkza^Y2qrT_p3xIqB| zXpyly|6u=K(av;BipeLZ#&G%AUHR1`yUg2fqyz=c-{de?cxZ{=1jGZ{$iSO z%#9C_ukykw__~xW;ZhSbYK*^-X zANFn~orjo?&QU7K8a0K@kVrcUc3PkHqAz`?JAtD<)e;tRp9|V3xiVF(Zq0nPT+39O z0BV$#tyjG2tkzpXk3M{d&w(DxKE7~wH)nsP%jQz?N@Pd9r?w}9P={}v&M=d^%tA4k 
zYP&Aj$^B7{jVwomJXz>y`G8jQo3)=>?xfZRW9&@CJ(#{kkDxg`H>)xoWlG050oqlj zn0k0w)Kb3x)r3Mf^S^?H%?aTo0P>o14w%)6=5z$O7?@9W??&Clxu4iu&3 zgluQgAkM?E@V?B6ahMt`r>C2{2<2OA-sg$*-36;U553HYuoUD))z4eG7x`}#Bus)*&Sz>_ zT~t$&gC{=$W~An~T%@>K(7f(v^ZgoJx*p;;WJWR(VM zbX&@D5X|tZTFJ=Fe^3^h0D<*qK#AI`3tH)o=dzhQx8`w6gL?UW)>~5}Hs5k_PqhOh zkmc;c++Psv2`9s+`^Fxw3i9&kQfQ7Pb5E9`z+jBtw;lJe92*6)Syb3*5RnX9rM?fP z^C{s2FXnByaYO3=>xIOpqAa~+1-hvx4b{N>uh5Q^0KjV(lfE9s*2dwZ? z6c|psRUFyFMq^$&*FUS>X1GG+#xjgOE40P$9EkNG4NG^*LvDa%RwRZYa2@tV@EiVA zK-$Bc5mI2hF3?h4;1woYuuu1^_r4{3P`LkeJ`3XA?nTgm>qqME^U|ETWH?z2`mdKi zs431v%NJlIG;}Q5s^KN!`}%UOOZ26t&QC_jp}>?GfU6aeF6eE(R>*#<(TF~$iY^?d zdLLV{oCLEZb1I}%MA}?J^BTC@voksF753X#wrWbOct}LOIl2luw)p!le|7JK95|fx z%qJ<>DhRp8cJ+bi5h2`u>aw@|I&OS|{zambVOoqMpko1A%=Z2j2bV&#%$?m4YpW4p zWLG|};vA8px~TQlA^F1<5~~kk0XA2EcSBWtBL{ctCrTw_cU;OOV$O;rkDhPx^WTDr z1eE&?($uoBg+|ktL|;Wn1iSk`T+;b>!dyGsFmJfOU`NB`CCx{<=t zFw6aV3D$_KLFql}aIo04pwaZ0RjLtWuu88=a%!Qi7z}ev6BWLY3RsQdmMC&Yu>Alo z!uL@Kg+jscZ8sYZx79e{p4N3@J)^map73D-MVUtQEUjP@xJxMIMIhy0Z@{m)cFJNc zpd+B_l4Yqu(T=ivs+wvqjJ!|{Rt8vr9TCZS_J_+>XICLcAJHcfyH`GgHZ?g4V~*nK z$ERR6rD@W@0^C2UyM?gLMOeZikHq{x%BgY9FBQ(=scX}y0oou^?;zsYS!aI6FaQ7t zcR>MyXpylN|408$Zatop+viUo(g4gFRR&uz)&!5U)%>}j!q{?k4p%dWW?Dr~6pSUg z2a}JQzVb79@FwYnoeRF=fg%ZoPAtny{QQj|x{WEw5)4rr%rnu*;H%6h4H>ujD9HA`wU9s1Rl34>7KE zeP}EILErTWUN{-6!^WKoZK_pjdnzuWdHWQPB>sZ@y2{ar!;)zXVJ(?(c6VnE7KIn; z9Er7izX@U|O96bBJWJaaY5NmB>Uy5k3==8}_Rc3&(5272y@tOH5q+@g#)x0@v~a-y zu~qUEt?fal@Ac}q=&@1=7dHui0i;!i;6GLd<##F4yIllX7&R1#`iG2KK)DY0Q;y$h z8@Y8H?t(44GFMLm3XkqyG$&Lva^G<0ztZU|i6I?%V_7eZuH?QkvL*@Ylwu%xMG-F|0bxWbLTXU1g%uS4+bhrnFMu2bR92_y^ zrh|wIrGDt<*0nw+-)!4Lxnx%rj4<0=S-*3Q1tb{x_Pa_$3CtS=v1N;PPMm1cvA<+) z8Jp(;f!5b=(?axXF3%MBY;iXgt|vs08EyVmXXaWX^up${bZEB?8*2HZ#gpV)Xh4~^ z^s0Pd-23(Axy$C<=md5HTBm-Yw=yTOOI(bx|CjOnbcxfTi?ScDeP1f)amw^|OlDi! 
z(d5ja0;tS2o?(C9igP_}sJW8yvU&#Puy2=ms1|Jwa) zN9RI4B7#%;AAM^W0N9ueV_FPQxgN+y8J2eqKTbim-XIBycC0QYs)e3RZOxRfF`@Uj zbYF0bc3HpAsI?W-)&hMGgSFK0kRSs*_rkAXpyk)Uupi*l| z#kaw~Tmt8B53`YT5TS#Sa{f8(dJoC;AJRsss>N_IVIQKD015^VVzc;BLViZ6aPAlc z(#9*EXGZ4NzSgt+0Q1WhVamJmh{(l9Zzl1(n-sZIqmwK`@nK-jSx1+2uhBT~n z_Mzc8YdPdPj|>>@nMZvJ_TmQhymF3B>PPGAPxdEivs!meFt4xThx+OZHG>Is+l8n& zZ9uRL08n&sZLF$l@|O;i)O>qI5@xJe%$3_J{*aLDYDd=Y3X4j_%NU4AIADQf{%9G( zG!rTXyHNKB5u$qDGHOFI5YRuV&@y3+Lf3aKRJ{9RoWy?*8%3kALN-|KS1;F>W1}!= zR2ojB@HukY?39R6Ux7FjZM8G)3U{D#v)wA7KJBjj5gNWhX#Jj zm{*RHIkbyDXV%E7o!)r02r3Vnray3?JZaoGz7QJUoz9r$m22D6yhU`0rV!Rl%Bi9n z&VpgnaZvS5fxS{JqqQ4Y)G^^!lYZEXJ`*|A9kTC!qekaIaVDs*mdY4+5ysCN}qJan!;jmwhMrf~7(1W@XzY zn86B$`5>V5LiVv#_gO6JA&P9@W! zqWwfRVoNr!!iIw`Ur(IZeoYJ^m)SwPnFcGJKLkvnXblVieyhdU0009RL7qZ*mH+b> znCc$I%#kf%9JBvkTE5??9zOU3=Lh9waa2g<@02OR`E(!mx>>~O#vjGOP z$J=*Mj%$scHkCBy@wHcSL;NgT_YYNS%N&@*zMAb?CgIiG0@llnE z4wQ8rVFPe~(MhtRr78CFU3xRC-I$j9uG8ynbNMhy-ZrDG2qDF<=tPgO>dc}bb3GYX zl~Gum$U0gu@dU$kKTa7BCQH*-O#XsQ=zuPrv3un>A*GEcSvT=8$63;u*#+|@ySdck z&iJoF0xFD6pToa&MRTUt(VU{&S(TbV`RMrIYj&{`JPHZETSHFQamO3vUvb=4_2NE} zt-FBJub>x~Fu{448AIC<=FLU`jf!o)k=Vj;htO2{ndB;4_eVV)`WnCy5`KAO{^{ChtF~ve%idnHdCVKKF-(%zDoytO-Xb(#jO~(@?G< zGcvt-MHCiI3W`F6c@p;%WXE?FQ+OISiNF_i-vny7ry_`Nrr*$Ix<)i|9m<>5gL16+vp2jx^Zd}m=THW^&~G{3hi8CE4Ds~m_?xS5 z@%zFpoR$B)o}PK#-^o;f(xlWTYKP^{#KwS?d6IavN`m6$$Dw3P8C040jyqG$HIpnP zGaAoFAw(`u!Z#f207W@f{<$%z13xtjR3K(D1g^zM*K}=Sz zkN7XAQzmXV4aT`Vybyn?tk~^>+0$05qkRy96xz>5cegC}KZSTdq(liqm=qxwhi!5X zcUH#JoZ2*(XvsfMHcQNXb$76URGg-~P09W97oWVlOx`qr<0r+q*8+UoisWxGQdwH( zfdBviTR{N=c?sf2|4~$$R5!yT5}C;Ai0mo)wVx!KJ6j10ar2YldNmWt_-Tm2WG=&h z$rA|MOF9vV5u^ff!FFOFX2P5;1oB4teO5oc2s}m?modGS>?iOfS)?{w0001+K>>n! 
z3F4oKLcT7fwBMSWG_vB6hDnl+Rn4*jnbOF>*tRAFO%E(i6~>80;8+!5j^Yf_%DZjF zzohWf5u4TqSJ50N3R)K>jhwI_sNs{IJ3PWqBwmGB0EZ&hA~GFZLGE2$t^2QU+v&=6 z8bQ06|15Ps?u!)rzFuFQZ8W2O=%Kk^J$2?VADsQRcW(9~$YRBcaYp8ZggG4dtFUygr+?5vEXhl` z7r(M$7;)_`o{FOB8~}7&Wulkop#^N)e~G%xNV3*T69*sU9}L;<&(0Fpp$ zzs&2tnK%~~7q4W0ef^%R=<$eoB0aU&GQrADzK!Q`?QfGv)>KL}9GShT)p~zYb;?Kz zB|2F`{sH8$!#Ktt4Hr+S^XI$cd z{kY?;dSn)b1hT8EJWfGs;bngKj@d*sN!4!a9sa4yV%ubUP=ge7Z0MQF9LdL!Vu*!> zh9(RnA0Z!u2R|jDQqBf4K(B7|;q)5MNz`gNvPXW|ju5Xk{2uIg8}foBVN9FV$j*It zAA>5F)=Lg*SHZt8hwqy_YQoP-e{CRL+TAr zL;s@?j!lhLkOV(*DC+5ZHXY6n`e|tFx;us&?UsXOD%MN>^5RAcEV&_=eW=2Ptmfyj z%Fg)q)0yJH={dQmoZQJ1?()eh-G9_l7HN&Lb0SuMPj3jMA4Xs`)5Bv#tWEivExPYV zgsl99@~CS39=-uvB(BnrCnHW3+CL!Lr-xO~Tn$iou?q{&1o!+K*u7s)e#Z^dFBv9; z5~YS{1C`hL&r~!WXVfCxTUZ>C=$Pdd*>lgwH9n_AW#e~mj{rSS)tn|NfB*mh6afJO zd4{g8LidF+PA2z)G=& z2GD-@@0X9Dsjs2_BIApjqb6gpWf^2mHgjv4sx zmR))jWlP#5WTUnfhf~DdZb!}Z)7;adz?aT`cSq@w6CO7;j>Yr&pooEtJ|zM8%Lke3 z%|Mq~;mA6AH;FuxRq>J2Ix+4~4VA*0X#voE?_=&@>zZmY_Imn*EZA7ca zK_p5gxe=Yl9i2734n zQ}|!-{n9tyB@EneznO{354xiK3u|hGK)u>0sJnGI{eW-f_LUgnMox3@+$nEgHEuVw z2|Nm`+70Yyg~To|RJC7ROfgs&jZRJ*o_nqs%LZ-n5u&kQm`@n=Hok*k4c}w+0vf}x zBj>t}V>R`ThF^HfSflAr%u)|VrdhGzK+`$88Trp^a(j;?!bZJU{!1|k#IE5-4jNvD zb%wF0ucKXk%9ei-k~|ME^p&f__1$RmbG#KWJ&SzNtE9Uas)$mZ4n|%E|4Nwv001Zf z0RnlAsSyA2kAP@u{^7SrmqN;Flaq=unt4fGc2ts`7}^1zJvB)H003wK0fKprsWboA z?NyrydswsK`A0bvu=*Ua5@scSpyh>-^4=fiJ$q34wy&JrcMvHKmf1BadCiW0{*vjS z75A_hO?(co7Z!$9mF&UkduOOrIgU>w_!o6%>%(g5_Z7nN>VGXu@B}9S00=rknr2C; zL1>vw6!CAIt=oQ}0T0G4e^*rIQat^Y_20EbgOJJbE9`nb>!e)wE3|UaAwsdN9Z}9NEAzVw`DW(S}s){|#a8mi9 zIyv64NTSr4n$st5@2ibd*>VF(2Q*ufDo?xTq!K^cEtCo&>g&vANTPWL zeG*L!9|IPeKNM+lW&k-$eDp6OP&5v)xQ9bFea@}Dl*t>ooDf>WHFnezax!duD;C7p;MEa8Hi+*!yCbG93 zc7${L%^=yr{ccgpO$I6nJ@z|-zrcZRCeUygtr|I}B!(wItLe6=)3Elxa8}GhR?G%g z)%-o&Zcij@VDkeI?%zMyVsU^o$ZIgau{xX*1l|wY`x#Dbx&q>7eps79$v*qd-Mj1E zj5<$w0KjNMxzGAwf+yS{zj7V6PBMaq-J$jnCi^&vm4l2Y9^W7PvbE8`5UfWKBx!#B z{ptkf;m*4~B*INnX42#^)OgZJm8A`vzzhgPVR?WWhS3bxz8Rg(=Xq=lsV-b 
z!h&~ts$dovsYk~QD*n+)zI8Ae7|m?FhUfihJW%M$4I-FD{P(gSQ^>E0^LPCoR`nkda5=&^>jYXB>&=yq&Je` zFfhz=5;^;mVdufsK^-0_*G4!%ahr_DSE6y$O(5gcLtV_d+4Y2zDtH7;`y6!e$n7d=>;4L4axF-)8B@ynD;+ll4e?2?wB(*ii+=l>w_3HbVZTl>> zh=)#aB@iv2f_efVppnyMi*`#Z-UbHDB>3}qhu3Pe3ef6LWs$bVaas5x5Epq&TH;1z zg}Q_9V#I`WZicIb62;fUtPt*moCSoYlypIWOk7DH$ig7X9~m$pY&80Iy8eyH>B2`R zsM^`N_KQ875~eNF-0)I#B!y6`GzVY?^jRN9gfz`E1#hY305!76T;V86WOq6>($8Nl z7f3mHW8*7mFPXHV0z#21)mI|>T!XT+r2Hi>NLC8HgF6Ry;b-r|^$)J-_QLr*of$iH zORrA{%#;EXzv);li>G$p*nNW83D|A>s#YFxn-^M@^>b>l!FWM2z`+)?o-$SUi|zr< zFY7W&$+>#94yOiD478(@wed2>2a%pBT%Op~NW`0i2V zG~z+@yYK|zyw08U-6msecI}1O>4VLRw4&U^eppWZ&Q9UM%ibqYpyUIw-Kmgjcc2o0 zG&=d|Ka4GV9}q=VLjt=53PP`Gt9Nv5V}9t*+SrNIMjJysP8pylm|WC{a=E?tJ0Qv^ zt4>9sY^Dci1ZyVK7bN}Is%0Wfq1ZcTNqTgs(J^A*``%DWX-fJKck3)L_H|9*H>N}w zra~|^f^|t?0itLHUvJD^OB0(FpQ-G}e;t;)->RhIt$MsoFh(*Gr9+CONfT@2Mh+wQ;6)fGLO(G6MW9WIoW5@q@D0S!h8LSUgM;-U z{jRHaySfA1&Gyb=7zJ$LwjIdY%zJu5Y$P`e-?%$l&0B2N2Q~iy1=q6GeSAyTyILE^=A8AV%oX#80F$E z(SwGsknD$@hW&|}xU#3v>)lO`EKk$;u}#gG%wxD4m&GE59e0(NDgA4}#=uL%>N6~R zJqS?H?+;Ds-Ao_WZ1Et|&BiQb%>V!hj6nhkrevCg7KxO>Pb>cdx^J*2#?0gE*HNz- zxBU(iJR@j9JtYF_w_}|h`t(QmS7EhIadejG@6PaT8S&{-35jdX3L8sBrb`qDWll7q zT4@{QI=?;E)mc>v?h(vXt|>N;i>zNg9s~yznRR#IReJRItH%Tu!Tg#m0vn-dI12cl z>;*2wPT>ootwF;b-66*z>8&f~;MJIFd}Z?oG#^#OfDVn$bhynS$cS<)&>Q|TXPe7X z8k1;8zg=F0YEcJsds7i8#?^0Qci-CeOE_NFhYp>Q_;227vXSEGYP?9hAtH+0Fm!j# zAr5|NNk?VOUX^RmhGeTJBq=~uw~nM$(l6N4uk(Hlj|f}t|5ItSPqQoAESIoPE*PJ0 zrPn7+FJ2TSdS4E=Yta-Q#Ab^?bhlio5kxV?+oH^~1(W zxKO!{c5_Muv5do1Hg*-LHy3xVa@A7Rfm2Rs&H`V^@T$SP^B=iC|BSrl#D1O?tZ6*6cHZsExtwET?2)Qwgi)t-3r{2bcOSRwm0*In zs+Sa}YvFP>p)8qZu9N*~)XMghTK^tyZU3b3rEzrZs5CoDk<9mc$AJ7UFabM7KDo!+ zh;aGUwgfF(3TtNJa1pWN=v~}VX2@dkP8x7)-@-GpFCOh&5jw7H>|_`hq1EsN(|G=Z z+I$Y44|#+p%U2OZP4OZzy@s~=c3(axrdM_v|6~ zNKY$l$E~(CtRfxvd=Wpllw!q*6N=L3rZrP=szX>H6QYcBN2_ohgii0*;7oVV(bbz{ zE9-^KGDauf{`N>De2xV~6RG}6T864}hCmC*(o|x=dM*sH9HaJmA_s}CLL4a%&Rq2I zz6&;jl0XIqyuPC zWpm}s0{C@tN2+sA#t7%%BojNEPNyBy(El(iT!Gy-!v}Uuw9uidXs{TiwI4x7i)ndv 
z&%&@+`H&UM^W_$q>eBK=UsVOC3(Q?efsa46LLMLdZBNxpqI02+`*Z* zF!Ux~JMGury;Pm$MNXY^9O;}88j0GwJ#~kuC{hK-_+=#dnK|KA3bSfsf-wn#9<~@t8MuAtTtE)nCq}6>&Js&z`?49Fn(g&~O3{1jo z0>3vFx2;~P!Tzc3UP+EOXc-KeK71G|8R1*Dfwj$5ZrLW^9-NcoK8oJ>vxFZ5C{H4+ zL*}(p*~^Sc2R-91BdbsO#oj*tS`63yAO5;EsxhtL-cx@Ih(=}O6n=@+_crAs+?>7I z6d+gVHbcA@rC3zT}V$rBB)8aCPfLp#Q zaTO}ERD&I1mz$o>HoP9=ST{M-ufX#ww=qd3?OiS1ZOXpZBY}J*-$8|FV@NOm19y{Y zhJeFZJZ+LIWTfGgU7z%)6qS?OCcycpGcyqKm{HvoF=&ZptIybt=f5{E-NW)|QD0yxaHT)D}(kSWx;;Iu7T}+UYT6ZsIHX z75|qcyu+r6K~n9%!Wk+|HOen)(f)?5Q~gKaGj2KxZ!~{iw-ljc^(eS)RNe#qsasB- z5b#b=W>E??OO}+cdNdVYF-pV-h9^eNAnI}JMi^D_iZDbdz*q4Zd&#IRgW7*|3d+7f zc&C!04Ind=D3A;tKGvFslJ$d*!L|X__XGlhfdv4T12>qpwTE-$oyOkJls5>gLhCIdLdIs$O@RDJOfmHX$oi@>-+A74mf7mEw^{e87rp zgwf>m8*+J;d|lPSB+VN6mW%4Zm){2&B)AK^tso84@2#QG0Y*|`&wBW0q#K+q5rtWG zO8zR#4PjfRxhZ2w!*%DCB#j&NW$l@78WF}weSN&rp(OE&ovy(Gk_m~qAOXbSOB1Vj z9*a~jBSR#{z@=RSW?NSgxp-_k=o_LhBtL`Hz!YUxAyku#`&(d7hKCP-Q&!2xgRgdK zCcoGnn6d4AzI`5yuS$r4YIrF6VTk2c1UyRny09`3npCn_HIH0BdqF+y^G+c%1xuVPzF|_X4~56t2wC2uHPD3a@Y#j` zn@$UAU_FP##NExrE$yY=E`>rC%EMt4_y&=guexTHa;*6?;-V|DpxjLa#%?3j!I)^R z6oM6nUTYmj#{My&WHK$#_z+V_`>{-nK%S!ifU744YZt=7|8uuZX&U|i9|8OcpL?S| zb!2*fNYWj1fbzzaW>Ofx8V#O#g@RdAsG+J0yz(JR0*Yn4ylh1X+K|jd1DU|~j zFo9~;3*IkHb<)1`)l(i0RR+9!<*(;jfQbY7YWE=lvNqUqZV$lMQpK0yN^KT!kX3YJ z7189eqgaL5Gy8>q>FEB+fMWm@Im$@~IUxC^K0OfImb9e%4DU!jdd*aOoSKzn^oy$vDR^`G2 zON_ES2@83?Mpt;N40*(8W!9@1S_xK|lFz#IcN{)UV@I*03AefPcH(A#iE2C zLKIN!tW8K9%TJa+6woy|-FBIniIrLutv?2;VxM&>;j2Ol{o#s|&(R1c!qw%R&&|j1 zTN6LiN(jDhiMU(w4}tgTWyypEKa&Ke0(zsVi{R%DI>QSgO;X?E8^~F~=DR>xqtv`z z^Qa7?%vvV}P-JsaF`Vg1@Qdc7bI4lvMJ5nXYI_i10NEllvm6^OVm!c&K?qv0TSoWo^gB&E#TaR%5~YO?ylhkKK=FN&&O*fI3)! 
zEW%-v-Wphfc+v&J7Fk0qTseaUs(7+TZSooXMQe^7flqKSxr$J|ZOtBRo9p2CEdl~B zb8dxZ&`$Sj-sS_vs#-zu1JFR8L)}3XSgsJ#@%syibd?oohxsmT(WHGP^4Ajv z6WiDWUGBeS;wdK+i!-+1{YSJP8)w7?D>5q(@|MdZX9s_T?v5sj6 zldep~Az?7C8q*e+HEzf4PZy!N9(KktLN#=~2!R{( z{Fmy7gJl4=^S>mVhBJvHSn(1Mx>*j44Z`^8eA7=x>|=Xs59APt*}*6 z6&V>>$%O3v=v6plb8wHoro=@~6Kr3hG-fYhgmlz_m@lI^?4vT~Z3QKJtCTg5G7*TV z@)BFJtZX}rwprnVj8cQoNG0l?M7$edCE}P!(Q4&FM07JdLz<`*LQ%Jy*do0-h{N0$ z(K(0o*0S!*QM>VX$$f2!a0@2QrvxmIs{y?5vhg7w0`%fKp4)lG&tB{%nE=qYPcUe? zIhniolYY7K7m3+9Cx;x+1R5hl>Xx`Q1TnASfFgWZxDK2!;vHGS1^F$JM6#MmfY>vh zl>A5gC9@|C4-?0+{z3Q@gc8WeXwA@%`@C_i_&}z4rR7dDmA#0&{5f*(=+vl8q4D}N z;&V+O$ByjapGMd-k||+kCf+VMt+7L;6#2+D+*AF`Jia7ygd$k%LNp`L{iNjiZBD~i z@?w^sGwWoQgvS!0>8!@-Rx-U}skI?*1q@NDd&z<1;O=ohmbyyLtPJC0vnOQ$YcoKC zl}9Rfbul^=?uda;*NEVXcdu{-*76wDw2;2%P$A_lxrLUuoK5c2ER*Y3s!f`E1&}Z# zhKFk^nl~ZBYxhK|z&5n2>)ci-r1zZI{16tvSqZo!a#ej$cWq1X;9QQGm_-}Fuh>ls z#k=)Df_v9LszRZd(EgZSUh~4KH5DJV0ru_D1U3;L<0DVZw8(ZEE=_g~omH5$h4C!P zN<%t0zX^G#6(>ITEkFd-uQ`MwK}F2sEyw|f^XvEdF;2GdR$PH7@~)y1su8k`AaTNo zT1y5HFS5TyvkiUtET4l$BtUWj6>GghfpJXFhWB|^-ITU|CaCYV>_;5!1Z6Hu9HsC* zDjL~0VE_ODumPThYH!Tzh?CdzAEcTm@pStp#i38A(ma*2QApC1>o8JSdv^rhM5O(6 zD%=Y`)=SaApzol?c)fffx?DG%TbEqeW>85#3Xju}ZI8U3E8h8))85otmCW50PS3I! z2$yi--cyluGsYw`Dr3hTx9B9`68xqJjW}j>OSIumfYR5lo+HuJj5%bg!bUz2Gq3#WndJCfO5hD&MglU80T8< zXD#-mk|SzezdzLS!Wm0T8%5nsSIV$!^m#4E#;b1Ew^gld>RaRQ!_6FXc`hEIp^-X? 
z0a%O#p&fu%-S@1WzHzMXZJY?$QPoLWS|!K;Y(1pctsGXVw??ZIm|zB=&=p~3NP4>3 zV-P_FdF*|TnEckHfm>tb6rgpj(rZ6(V|$>^*nt_)i(A}kbUVF9p+Em*2fU!;@5Ix7 z6S(;&n${2(26i-Hch%-f(hK78O=$M#Jpcd!MF9c{=vJgd|MI*lD)d4K8hR<$26 z@`--<*unwMZh1x6S3{gY9?-9U57l{1RGs_=DlCxZpJ+=vs^IZsFkqv;a6;yc0Xf(} zK!dA${y3ix7MWt}c(QQ1VBzToJrqBtu_UHnc7bU%`ib>%LQe`kA-*q+#5zG4_WE2d4R!8oaxwWmx(&NIVya}8$;_EL#5Lo8< zF%~%}%K#`cz;|FOY{fzpI`3P@(cI+ga$^dBjD2zMTFI%4XzH6(n|>--m~<0e=_yWs zQa_*k%@DPvY7M}#PNULSLGp3r;+E3aE0ih+P{+meHDL%3cAt@MiOi0{stjzxoaxdiNng)k^6Ri;7Udm!}P#XOj7=9O*)(%zWu# zrMoV7b!T+x?1$Sc!rT~8;sr~Cj$5)o^WJ;d0BVCQgr9%af)$KZ=50lR2B5IW?sBX|6?D%$H2*HIVb{ z;pD|{C~hw#7fy&Dw8xCns^YOX2|Q^HEuHXmqaQa)?{jffR|HX_t3XA!5&dHU4kY?m zvOVC$GusJ{9Wq!XgXi0_@Rw{zuoBf7hZR5_hw zMruiq;ZBIgw7JS!@Pq5&Lve3=R>ro>rN-)A@^w478nqxAGIkYMRASuW@d|AmPpz=8 zyVVWbojfi8;PQ2uf&N##m8eMaX|gJ<6>VVoWN@@)56+wd?W8(n*!t|d{O$UmpRz#x zrGv9K^C!B>E7zC*fyrS&WzYG4@eFsoJ=WTLsRnJXW-*j%eZV+&n{k%^-1{-^>RUHs<_L$@wR7w|@)F!h~BZ zZ|X4TRI*VX_U}4kiM_u6|3sA9--_9&7M+s@Ls{vlkbS`*l-MIKj9`~c$mTybNFGU- ztAnx=oc?dA4V12EQwE_6dks0u&}Y`^9S4>1Z?iD}u3PS0factLfF0yhG(0ym=9}6~ z>lxp(lZtH*;hw<;G7&NGdv`u(WJDFe5_w}^Y7g8)bmd?u1>DbR>=&bxWOmJ%l1Eu_ z3UFlKTwt1*8{XQPw&_$&V@R)`rjb!pXM=Tcp2G-m(Ua}5_;vKmdD3v5Gb+rb)oT*GXm z)I%im8ClOpOla94AAOjl{$PsYI0>ZBY{Nchxi-Z~>|2H|cJH(e&*B*VT4jKPMYNQs z%e4kgC5kUhT1jts^e|`V28i+`f2p(Hx`l25C0qqty?8m-4&S((1j}n1Ks0;U|JXqAEw|vTPj`=Zn@N~ zsvQs!kHzAVOZyS6)Vo#^Z^j*VRLIgBVN$zAp0TZpTX5;0+lsv1sm$S1v8)3ldk<5_ zD!jSZ3-4sI1Ms$RByC4yK(!}Ecle}AuPd|8aud5BDTU69f>E~{it;-)qIy6MOOl~c zgQiXN-JdKgXjm;6S zNdud(_=R5%{_ds39)3(< z$~6<;I0swdhvvbxiF4WkbOS1lOn*CKVDTq&v!+kCuR;q(LIK(|pUB8Yk3m1inT&8p zksIyxJ&MnBn+9WMFLvK@QFQYBD_$7JCQWkndO|2?CAL8T00_!K0tu+3o)D!bQvz52 z0^IUm#9dnQiSJywak0VU;=t3ZGvkelPJn3Ob!*0?JcU^f@tOkM4NToWVX~o-Zxh-V z!{Wu>V(Zt{2Iyu849lxQjQB6+^->icbbzrTk;H8KTiv)bYvz0xm{-g!-yhAY<5nOM zjqEUw)oBQtT50TUaVXI~nd*1Y#;Px)8%$_+^C^Xa?hc;D2EZ#*8& zG|`YZ9*VolMI>IFzW10hyw?cNKHc zQa^&s4$v|0j)erJwtA+v)|*kK_n-5%lIAD4e~w^C6>y3Yo3}68&HAOhBk+Bwcd=MJ 
zJ?LW8Od8dm_<5P$HZi5Dt7D@}N6+p=?^^*B)R0h5Jza1;e%X!;$OR$M-j`}paK>Fq zE(#}2y5pK0=dF7#Y;aXEUhcIF6D>}b3`G&6EQ1!E_V~@FaAq_@qE^sJuUNg`EtYLc z0%ksdCM%aSA+_%VTrDtzubgNo_6L|2=tCf^knR8=cLYQSvZQZOS+~~w1gyX9I;p4z zjvSI2$tai)k$Eucm;D9Dl1vA}ABiVdyb^wBVWw==Cx9nxNmJ!d0)g4unrdp#p!c%y zUf%;=llM}dtNv!1oE+7P&TX}aOW<9M`gpO7{JUmW4jW0XxMWrSR zU%j1o%miV)n5r$w%TN>l27SCdE?@%H&s-Mfc3sSodqCib4Br@io>D4YC}&OrJL~?73O1h| zDWsU@+PDMb%)F4a?GmG`X<#(c6FUdM6VI|@$|S1!W&c|no|(f4{*nDyhIZ3z;1V2p z_9W1ELn#he@W_H+tGG#kt zxDBnN3RaVu_V!XS^~H z9hzobeHcrgFfFy1RBS&WqumyhZFa+jf=J}CU?xawDM}*Ac7i4m3v^Ny;ErLwWj`s$ zA5xg*y3EC;tae!<`DO9qKc!kYdC{d()HGS1`hYWVTYjUo0}v1xwp)!-lhjcsWg%d! zs@pHN(wuX^Y3aD%3us%{5ib4mJ@Y;|;b@kei+TpAap4`ysBB{FV|7hvOYZ%zO45~o za&Kf&`--bQHj-zxI8fY^^li^%=rP28XCO4lS`Io2Ed|?apXx@tb_o|3Gsh^;l8(n* zN|o*=lzFxn~t7sXU;{uQsPn9|GwE4B;gRSs1<<@jW_qthqB z`GIlv^RiRMFS0=jB8>RxaMR70rj!j^o|CH~I35Jp_%~Y5w!_#|EbiiPA2zt_Uv5TBHs+ zVryu1Y0JcHgFF-s0CJf?X)^<;Ay3OKXBplPib{y4J<~Vj=uw$-f+>at&}U}hy0km0 zQA{E&LA7=9Qfl=r@|u470hS**^ac3scXrIaT( z3HF2sQ8Tw3x5r()aO8TtsLLu3)YxEt4m;fQxhJCpCbmSs>?#cU>D11uj%Tz$6-^AB z@@{`P^a&qg)vQhS{g<~i9@K*0DOlfDf`A7t;_{UuMAjJO)b|uojzM_a7+OF)2yU~9 z_Rx&G@WcjLDASV|{jXPf)^E3#sq~Q)3t5LvZyG^J{RYnn&)kQlZH#8-jz4PvBOS0r{`4#2(ypt>0$C(A?bf~$@Se*#a0_V{--7>~EgB78B z4=R0AtA2Zmr;(}0%E?()L+s4SIPQJ$nY2%9Yu*$xZc z97VQsapJRBmZ5tST%z?u z67+#P+}DMRf6);G_-pKv)M+ZpAi+kXKqy4)@#hMeFxjL7xHW}lcz!ZjJoN#TV$PIe zYkiXh@eQGum2aoW&;s7LNkg~vC?_mEHjhe#6#~LPz(7uFA8 zg5Xb}qns}!G}fFz9e=8&1CZnK%RKQXL7WVLgq~`O6x?7My-izYF29%rxUJ^}l>VHO z>2(-~?>W@^F5Hqnk9`uXBm;rw)WgQ)*9?b9Fz7h!aZT{a*}S(HS*KQqOC6kP`)&n3 zIVfxcAu6&S16b*Ze+pCkF9@vLKC=c-2-&2QZ^Vg^03M-CD;<7(I&?7D z60+3arJW;}*#yfS%|=~=*Wb1yQ!f`rEPS3SSt}`}O*13tjw@$QlZ!!(H1Y4uTjCE} zk1b94v;tEY==ZuW8D%~;!YV0r1PoTS1u&KAk@DJ3vfy<)wj~x4)tyXft6SHUFv67N zF0AO2!bGBuF8|z2iPC9fGI+hDDth{|JO3z9LYJ}qLD!c3V8&81D2Ek~`>}_cT#QKO zSa;F};C4sX?0I@hwe|e*k3tjfN5$GVUrv>0n=R+3`?tsnOoRYp zG;UiA2qX&!_PGgNfctx_;zHPca)gW?_s-zsm9l6~xXoMnNc~?P7_czvc4?#6g==UZ zR*`>VbTF5R>ar_-!ZDKPIa6G5;_}v`fTl_C_1k5V*1aj_dTd6T2ppQscBeu*vTng8 
zog|WW)H(itZDM)Tjw2sI-BtnTPLPaK$H082{TkB40Z8u z1vcAlp}y*>gfY+Ic>6bX0qsReb&>t&cOYx~6o*Hq^bPXvm=8hzXA?*yRUmba&A408 zqSf~4I)c^u$IPowmnx0Nf^M>h@=`*Mm1*(yQbXJUMBgnwVjglO{i;v4!_kjEb8E{~ z9z2$jqXS)Zj$Y6}u||Y%ek7hL^C7&BFhy_8f48^};|T5?v3#O9gP)8(a7}Ub8w#cj zVJD`PfHr!c?_HS#&y!GSh4xVk+&Da!)y;b_&fJrO4z;u&<1+M7Pfe|d5?|;$f=tog zX?@BegY|6mF8HO`i-yl8!1xf}ecKY5IW<@Tr9O|y;EaEyes82n58r(Gi*~;#j^F^y zsPcMA2fjW1>VYNE6IFy~*!xAH;8J}VpYn#hpOe-2x9;OjHgR1XvKe~U_hwU^UDM6x z^9Z2=qHv(e5$!L)X6}lx6h&#f@KhCkn416q1PMU_f@&!zgeggsz=?lPB<@jBvTA0so-%I^Or0-Rtt0EG;j=!gI!4bS$$tH)04 zHk+p2HqaG&# zpBb>o)!35nRh zhhhx&L@R!$$!9XDseX_ks9U^=&I%SlG7Mq{m-^b2@V%=vMP`VVXxB!DagLbUu2b$; z^3!-hICqcYu-2?S>DH7ojpa@pT}7PePK-X<`)_31lIzwHrJ`^`);P!#JUv@j;SlP4o6fDCIj2s1sq zhk;FS%SXD{(SNH#JHc(sJ!%cZrwd6AVlUgUP{JjL#o*G{WDg^wlm^6id{EsFfI=i4 zc6dL*Cw;)NKi)ijcxnjSIc2KE5Wr&OW}FhXLYJ62=PJIHgStP3_iBoz-SX41nGiyB zj*^i27^)xo*s|_pOUsikGa&gZDYB2?h-CPc$Yqr;Zs2_5N0rp}`-2pkAJp~4&!7$b&_09Ey2dqpEy zO|A(}3P_s89!;H&;}$&nt|}lXQlfjm;99(jiZ{|@O)u;|ea>2rkCm9{1Mg+ri%zzC z$uOLMshw^;xVHI|Dor6Upnqcdnsk}dCngCY{+H@f2Fh!ZXMVhV$HPxVtL&nIn}wvl z?s(I&&YwVK;#Y6StJQ@I&);8SP#AEAH#Q6r^`Dh?0|Ps4l=Q4t=3#FRU8kKI%=C?0E@{e1i}Tiy@GCjQwc=|9LWX))6#_A zgu(El)+;*i0!8%&H99xE(6Lu-rtpRuboBu{(`#QB1_pyw3!!i1xN<`E;xo~T7MOs~3OoBVyCzW2>#7Bs2t5Lf>q z9-388ClF&CeChV$AhZfUXA&wzi^y`1vYJV!BqFAK!)wZ!FlpMb`Uq(16ub=p&_FN0 zBVw{e^*EumPlJ@st^C zjvCFZm>@DxS*1l47vg8U;-u`QB5Ke!g)7Y)Fw$9?F}3X?`n@5eRqt)qVQui7dxk(y zph#S~h@H#fG@4c+7P78e#7+u?R56IK|JO(YJEq(5rl)Go?`u!fmpDq02GwzMeY8~s z$*}No6;?i;pBLIG5L|>eNL=yG?QtrGOQlNZ{YN)eJnNBbWsw0ipkafXCAs>WcWk_- zoyX$=LSAcwP2Izuy>l-k17s$hg1qwy3fE)7< z+et@KM$?XtXoMY^_R&s`N&8Kiu87y{28QXX4`kyJps}YzDOn=#7usfJF3;z`;T4IS zUPUC%1>$&Oc#}!)Cy==9SHh_Psv`6|_kfShWs zGM9s|b7fKEpEZ?Ifa@`;B3!AXFw{@L5RP5hWD55)-Sw~tg=lnyGnRJP>C5o~7Ix%a z;sI(uU_QAVVl8Yq`N6Iu;-pHSmK~j|UsgvIlIU!dVuB4P!PJO3{92Xhx)#$ad;y^2 z0i1F+z%Ofhb6l?|>E_`VWyc^b&bA%Mhvjx=msq2aTum%+ZBA+%r5!xfyxjt{%|w9+ z;Y)%J#;(NfOY(4>C?oKrL;x#$YjV4NZgc8hQGfsd13*Cn3F?GXMAQHKsza-8CyDUG z?ihDVY{z0xjhXB~Sy&Wt4o*VOwvVfxm0Rw~VYyW-?qpXD6 
zc?;tv7A?@DPMMyGKsb(;)FORjW@N2l+K17yg-Q*aq^1LTUq$vHgNp=B8}0riMGx4Q z4O21#+f1#}!-E$|AAtGKQySKK`rrR6xz|kDSZrrx&xe>!$+c}GLKup`B=c1;I36Sn zU$w0yS{QGKYRgp-wU1N#F~nF;@_>aeEM0aWNI(-Y*;QxN_GHY2+QS$@=q7ernTcD$ zxpHA}IG0$MAEI;R@}QMO@1C|~zA;q+?4>+$wPpIzZm|YXQ>qCBEg?Em9Pxy0 zN4a&w%+((CS5=(EiMz1rr;LdDEUYpZp3Esy#^2{7G>ZB`Q^^fEb%9XEvo|0#0%ip{ zOpwEt!I$rKaG|%Udg9t2uI|dLBh8$^YYU)aN<9CDg8h`8qTg=XLd(Lkx)CDlygeEzJNeYCNm69WIULXu1{N0|_se7bx_nMV#^nkv5-8hTb0 zhs18(P{$MfvI`7ncLparA4zk5ybpXLd<+LfvSb`IXZ;jt{anxupl&ie9w{kOFYPY; zhi##^y7}| zB86|i>lAb3fRG}`Q_~6#R~hq5^L0)7df}OKz0CCCLcm)`f_vpzHU9DA7-a_Z8IMEk zt}6MPe5rKCCYAARD$&?^r{*JvzPpLX9>Z{J0hw#)ISr7zVT&)8V$JGMCG|ZnOqdM_ zL#g(-Dj~qPT;_)ZxE#c{^@L)}QywNK{POBL9EC1oY{&or0arl*0(zko(KP@5x&4Zs zwda09D|Q!u7t4ZlAudgiNvAB!$~;>Eo?!jVWIKTcsOrjppc}T=NL(}|$4jqRyPpqE zxFuIq{6-Dk(TcO!F-DFpbP^}ccAMDh>jcn5l+apzSATS)s+#<<<7^w#sbDfKECgoL zU{`59A^q1}DD;WQ@5$3PbA%ldi!>B~3^^C_UMyT+2w$>r0|me@%kpfbwi9(`$AKZ-!a*IAe$NCc-p|UiP8@KjOZ18@nzLDmDKE+T%vp zuJ)~&0#=kx7>%apAM43k_Sk(?1PPNDRrgcuz_=Hkf7YD;h##4LyjY&j#DaV zXFS)(uB)Nm$Mgo=v#sRnU;qFBxj_MfdZ852H2Y3zm4r&}D1u*pM-(c$N>9Xqq%Z%1jzsDtVLNqr?$n%wJD1_Sm2Bx+Ax3Su5%O zJI<3w8Y~D12*zOVCBrzRz|U$B(Ij+q4Zk9zH%9YK)t60^P&qaY-hlTo@~v)St1z?- zyTdLco;%01t)e`UiY*LTRJyQThdEj1r{EpGK=>fCg*U)WY~Ep6lnL)J4<9c$O|Sp} z0dWDI$8>Mb0S`gL|7q5ZW_TL8+9dO5SeEySU#U+a?HV|$hFJAkEC|X@H1;lzBx^ zD)d94IG50}<{~GGT$dxFu+nUJNr}5x%wS0S_QjOaN-rN31TosL%hKJJ3`-KIA+&aP z1%)f>?}A^~rJTw0l0$^}yHi+Y!x*W_)2rc3!<1on8f+pH!+9?J2e+RMKT$ln-7&f! 
z?>Q z*+VYUu2PZ@z29M{H*gKMy*R5Qh&oT<$J<^(TsH;IP}z>ZKdtHBvWw=n!imf?gT-~f zs~1ul`9~=(LcEM!f9sI37OnA?|MJ@&maMn>ch+#&;UawF*p|o)N5J-XZ!t)&LR|k~ ztu?KK__Oy%q zSrOTRf8E@E^dJ-tQcm~75AoQya0^aco8&(lsW7mqAxHE&HOGQa&U>Az(_jvFDR&Wf zK+p=^SbZ~mFS2J$L-c$9zB*Y0niV8al;(fUbpFreU;Kfsampji=Kv!NO(sLG2QJfz zl#EtkY~Q>4y6^oYziE|I{YvciQ?_;rkPQ2t1@VaBi478P5yv8aS~dsvMG-ZmQ0>Gf zA14`J3r(5Y^->}G2gS8ax_zX*0!fP2)3ZVtP|p{Bu^*^ua%cLTz*)~ofHWy$G(vJc zs9oR4iEEo9xU>=XqrZQlrVvX?bY8X0mMwjhpFnlh+%d7zbv3a_ew> zD~oPD4;!=hc;OMGUz4ep8Tj18a+lsb0000$0Re)09nmxY*i$p;3E{IjayJGik~Lwo|3dRo%5+aFk43CFd7BW|HYQq4S38sj=hnK*z`t7|ijtSfK$9LTrI-hqGuNoocv zB^=o~h>5tsA?oI%30^&N} zsU<*P-BE|!MPrUr5|}7=kGm|v zB}nmskenOu8K~yv-dOymh-`jsq~t#Nd|aVJK&7)-8|jWyc0qWKg)-{E`N*b6mhuHo zsO<=M6TCV)ehgD5W8~Z}&WK)ww3o?m1#$-gEB4b4uD4oV+-oFWE3G9}n zL;v%BNX>32k%~_1#s3wrH-lZRIq`{NtQWxrbp2*R~FV zu7ic%cxPDM2R~H@b-o>p#;si@*kD8lC%-$qvM@BRf@0ut=9x=Psi;3I=Y3;Exo{b+ z?~v|Tu1lp_h&O6Ai83t=lOYPINsn`^KH0s!^L0EH{vNUIlc1KGK}^Nr29@7B|H1eq zuVtyt-@l-d7M!626&r+SZl`_g=Txs3dP*uRU_Y2qmfAlB@bmn}SYwd-SSd#Av|A#X z*9wflX!z4-Qsn=78l?yaMZy3e3J=r&I6*}<$iM*>liuf6J-P;0cS;6m*dQ04+ast3T>H$IZ$d=?sC)<0!9XyAjEzgB&F`i80gQv{ z0aA>DZl3!ZR{kQvUCSL2EGIZ!gtVCoBgHE+`QosVgE z5k<4CTA}phh6`!te>Af%y@$$8j0OuKh{1F#*ji*7>NF-mHrU*7D!1y^$r8e!I~W_J=F z7y!mWg%fF^PlY{$!LMz;FC_C)48wCok`sHrg~<{zFyENeCK>|yvg0PZ+P=s-x7et- zDn|}JM;vjYt_PO!B=gQ59`D^igVP8d6lf2?{>c5ft|%x^*U(Jv+Amc#CLyr)jk!vul*csBd;4m7hO0Q(nd@ymv9>q!VY!{2AxDXRCVyp!m1h5M?eK7D%DcS9|KKofp}PDsY+{^Wokw)B&zy%hA`D20so zl@jWV>v2ORP1K{*x-4~|#*ncB@#uE$%^kPOl?k39lVS#jcA=v`jL?{3;YW+RF7e>1 zy5toU)?vvuFEUt{ZPZBn4r$bqoIhT}x@l)LI@;J?@rl(-4tPoBrUxERLO}pl2 z2#}4|$Z%D+1!G_z&r698iw)xn_1dLz-xG1zebM%vCt!w%MFVlaQrwuB1@Eyg;I9^! 
zildmdfR5BjD!pvtUn2X>w9;SiNwP1twnpBMl_%~o42C4t)1}TM0m;nH1A0geCCg*6 z*e8<2QqLYYr4Sy6I-x6E{GI9P~_@R;xpo^L`9C&X2Eja^-INWwk_ z;5tmA0BuXXm0M!-(}JJZo1KvY>5Uq{6ZJ}G+uJfoWW*H)vL0&11Ofz$ZnmXtBByzE zZc^5#lLy^+)n_2_-DS9H64B*b2SoRzjQ-zZoQ>|oPerFP-Xi%KTQome!Y__Ms}tM@ z`1_m|=hd4=L_pt>{at5HnIcUq=B?8RZ&cq@z>gR9jbd?=aAP6Pi1dhFh&{JE#Lc3L z36`Fc-*1r{n1_JUxDk}IckNj_qr*}@3Pr})#qAjVya>au)ZEGNJpD&YyUgB>kob!E zL!Yh=cU6t!?>91=Fxf;H&{ShPS70uFEk^5H7+PwHLv-SK4B|q469@op|8fza2yjT* z!m&P(p*lPCRIfU_A^Usjl`}Sw$IZv8c^S!YL!zehDV64I-Wo+roP7OuAy=AG7N?w? zCauxc#~0mq&};!Ts(S>;;FeIxS*1113+oO_lHLP8z=GJ&Sz;8%N6sF~dbtHrtU7<03;FYb{tp-yY8}<sGJ=K+fZ|b1B!T4Ic{BDCmA7VTsDB;no79rCgD9NqM{-DbX#?^#D9oeA z&i$@7`cWKny$#;e$1bINgen#893bU?}`Y z`#wIRRjecJ+;T-SX3^?C0G-dSrN9o_a9ojZtVg6`VCp*wf7@Cbr9CLcWm z`%II2X}y-v>;M1^{6PWc^(;0A~NZm{DN>gac>JB+jtv5?!m zLsoxzBscN|KRQ9vDuK%aRm zgeFvm)VkBkdSsG8WwH~=SJltr<3YTzOh=4A0iOJg7%eE=Hln2i54APz);eZ_J(;R1 z$JCRqY4?rjRtQ0gUgmu6-Vph3;)4oK^3aIn+v&4&MFa)x$FH>aqu>=Wmyzv1WNBwARQ5LkyfT^yVIt|VeFU1Rha z@h5?2X-N>o(2~?{oNJU#4m+6OhLAVm=}T`eDyCB>Z_I{o@-nIqPgAS7*Qf@R&o=#f zj``fGCdAV(i%*Y^nt#tFs&lu6i$FBENib9dCfYO-&jC$8yB7E1(1? zQT}5r^dvxefPnpaP7rmCUxECm5#}-zcF`XYgT5OtjpA`{7JR~$?Qr3U&jJ33SM3; ztoI!z&|Al8WW#Gz&p%l@E{zM}Tv$rJ? 
z|D#ByFtwweqEGKnfJj5JaK*npf&g$g$a9a5`-~=-Sa&&!lypKIBR`y%4Y3_%H3M1V z-mU3&He6|tTEh!K2t60{3%4iIfX#s7*~ z;b+--Bj1%BQ_wpk=>@UTxYyI8m3q!#+s@=&v)M*7^sK2?Zd~w zOUD|l82SwR*IAies#6oIt2YJJ*0uN+_Zm7Z$dsDPmd;xxgm`F#H`@MJJlLbkoQ%^S zU$T{tfe^6&39_C&xb3{GHR=z-d}zO3xkpVN6Y5h?zy((8sS2}w^BUwF5DPI0xoxp< z-nn2sXe90RyyZbWOE~>SlIE*YFoL@C?b^8_Fqxg zSp4ry;lFXmSeGb{M9MS6$1?0Ra9uGZyP=sXu^_o0OefEmJsRgLWe0l||RW zFdzQ~ol*TrD(TQ`6(Q5zi*gFaC=H|@QItUAdO)I4RkFzNULxFE;z6iCRUEfyyLze|o6^G)%mFI=P z6R!5EH^ee*egnD+Yukl?BT3Rpge#E`V* z$=yQr2@df@R@1?dQAE@JwB~55lyg~+=8wwKw;x5Y1#z1n5TN3>mi%VZbn}~HZ!a?4 z+w^S}K-W}xn#&SD`7B#4;FGGf$*ks{k8^pe6b!F4Ic(7H&B;XFM?RGgUXIUCA4x*x zI=-2V_%Sbj%ldNzh}s&1Q)h;!Ngp6Vr*o{)XK?=@N#}UvgNL`ei;;fUueqD%Lza`= zIsa?_u6&#P8&h74RgxWsC=4||q&m!yzFVAY7UA-u+KP%v8_r7}Jr0vVW@;DS>xE2w zjbXcxOV^f*jfONjF=dt_Z2q6IFqR=DROl_l~RN7X*HR>Zhn&djhzS5 z@1|WN{2XEKlDAq@09ev4rLt3glok&k^TJ>+nap>Y42nE}E&Xm_p10gCwo0@Cy+MGtjUVG;yPoV~=6zJ4B0# zXX5eCpAxm_cz0HR&^&v4Js2E-;w&e%#F*MV71=;fqpO9taoOc*`Ad>)$>?n-J=#MZ z>*{$BZ9TF+SwOorS>|zk+~n+6>SV3r7-SgAn+P?Xd&-zaO?Oy6G>k1@w9^KLDC9(F zzZadQBPmUxvrM#C+Gi3gKQI)zpKf6i>74#k{6fYy#8J?Bv?KX0cOEKw7Mr`L1aJ1kSF*)p2f4zrEeq8fzBNjvO zfT&GQ;mr95bbrNmPCCxPz+!x3P%uQU*NG;V9p(>J7nkC3K|U^$<)OzeCI&2h0^6d@ zfK(9I5@v=D-Md0ZDQ446W27%yz`~=OD-A>IvWijN^lE17SUpq>Ojj!1`F>`4+gDv1 zA`A`!AFeb~mUlbt3eZ?7%eld}tqIHWR#oz8E=^QeWo0u~SH|O;f@T_JOHCLs>IHg> zk`ZtcX6&>Y*-*+5GYonundg?3-G+e$UK8+J+8hYISX?zuQ$4e3jcq@IHa__D`tlu|vp9Q3pB zA=37yB#}u^J~Ihk?#07Vo?3DH`CCoI!cn6MqX?Z$UGe}Ar(C}f5+fyzYhu^ym9lmH zFE=uGhrc0%^h#ROFZrsEP%tl&PEiLOshi<;*@D3_ z(~%jCCom|9Ew&0GrwzYtm&pGu%J8&Ke9Lb2)wvFvjUu%mR>c$F+#b%uxc^NtlZ+F_ zoWmETH!lEyN_pz7x=871{g*2nSefflY3p0*hopHscR!!*gY*`SRMF0D1Gs{x?}Ly( z>dW8l`34kV7UEzf!!Ewk)wv`|iNjm3)cpo7x4RfE)jL0vzO>9188*BLV}uM@82c00 zz#<%ksvLWa?=N)l(N>~wK{v}5+E5#Y+N=GX34T-HE^Mw*NBtKLz00kNZyPX+SH;yblrP?(+kh#I2cLO3MAwh*wk zM%qnZkglwA86I(jy$w+-yDF)UG82X&jU8$dsB*XEFXv9d`tSA{K^FCK_IsXxw@_6b zPkq7qi=sZDHUUgkCO4T#DD$~xn*fuYAwqRvt3{Z}3|Zla@?guGn0fwH#q01M2gED^ zHG$I}@Qg|Q-kI_AfBP#yb@EDMrgh!BKjT$*Ma1S0LmFk 
zwK9TXxV&NKk0Zlsr-iFZbB7qrXeH>)zQtL$}LQ&+79_VOsfTZ27`* zFEae-Ct=n1QqeWcg1^RwxL-7;{mx~fk?Y+B$nJWjhwB`$G30yIo_Z8)s#Sor02t&7 z_{U`_8{<+YwWO!irIwIiJNkqjvjX=R76!bgLRS$52VT zaJwoWhRA`mr~m*6_CWyxYdI#N1)^mzC4U0BF#P$(FB$hAaP@-W%u+^02i_2req>sJ)X5vZhx7SXgORMLp<5%KE@F#+@CGIAT@^45yP47ZCx-HR zN8j{T4IZ-B27~>mLn)ogb!qbZdHR6PE}$fR7j*OHR+;2&|UvhVlGlRuSf-kWiRKM4Nce>mDuauQxL&I`O7_}K}XrAxp zP~1EQe2T%IGopk&1yLBjcNCf;j6@UAzeZVD|8q+51LU6RjyL9>_&Awyz{9er#g7d9 zft{D)k2t4($)k6S5kxK4SIscmq%BsDQ)70UmxZCsd~D|jmXJ;Uz{acG*aBZ{S7SOx zz3v~);HY=0-5OgIH}M>)&S%+qQ6z;vw8j*;2P_qe_fxrsP#X|L^rR?Hd1pJA^3jk7 z>so*F7lSJAk3#e=+Rime`m-6}IC`EY66^Qk)|qCdS2oXgZKtj)L(5Gi-uk@93mx`K zXz@4P-7p%&D-#J@jImO?|9LqIbELO-3tT*TzrpV#3P$*|h|lx*0W%qxRH-wBx(4~% z=TH-Oy6JeDZI0R5AZ9WtR^V4P~u8jBtz zj?q&zNeFUd?&Sx|JW5PQK)>jj(o|bw5RX118ffS?OQs09p!U)Lvm|O{X)hMfV)vk4_Gcsytf%R$l1U3U#9v zLB}dJvewoeM}>^rByLpSkEe}6ugg5=oipANGz(|+TkCWdWFZ_g0o>GcwC`d763HJ zWDc3s@khlkK{^QHcbF%(-8$iH^4LH7M0N{f(2Pe+)3Ef$XF*lL1d^+5JBTQ9Wvt&RF{ zU6OyR=O}3bR;2gHO$zydqu3xzcyEV9uaZw0X$qN%PJW8HY>S7TFb}fRDSE-xB@Cjv zUap&KS1atk(-fWyi{lK~<&?4zMZNcDNPf%xEJ@Y#6wT$ZZ>SBq2Iv5r;BjIvk?UHn zcX8hBCKwvJyC3)t<-#-|V-5VmlVwN#T2Of0AMXbsi=862cbhY6Scu7AWjw6dNhbhr zI1h*haSO+y`lshPhE(e8n#7t*5(sPaDv@QX4Ck<&B@hev1B(Gqb3W=kYRLnQVH%Tx zz^)#UVafD2X>IZ%qz{@N9gZ)oiP3<_WU4^3`Y3nN3&B=@`_=;SA*r*J4|nJt?a;+? 
z+ts1s?$2M-uRHV%vz^4?>1OGU9={ws*W9QAY)i6?M12jms&-6%Yg>NWvF+Iv^%d>- zwwrKxhRs&}LYCIb^DJHTm4fT?N@JVmV?yF}^Py}e^}+=Ihm;P$&_F3Gnd)jG(H#I+ za~o|k?+Hp-8R&>pU_1*rD=PVUbX211(o(xvK4fFSO>(k|IIeu zsZsG0Tm;F#Mz>9~#3`((P|d&-o=qjNRNtKR9%8oPLrSw%j$CgXQ(_k8Os1b9@}qwQ6V_b7I?n_qB#e zYu0ghM9NLXkKkV`TAQ`KSzMxDJupT|4wninPEO<%yc1Ytq0E(fY{;Ml{?37B`GP$8 zZ@eBJhm!^W4aRo^dqT;tX|JeyR7-C6uh-=;=EaZ7R!e*2JZ3+7WhFeL68q;Bl(n7P-(9HftY)1$i>@u@-5Z&opNLwg z87K5!YI(edVg~TUz*ir;LnYTtu04%G*`?UI8#8AuHEC&8%hkDY&w&^AW3BGn*F3|G z8>IHgn8pl3^VO6E*R9y}-nQrMkf@&KJkW%Vz@4G8%Ly+)QTV^6 z6jSx^XYU<-iI2YjG2)syM)+exjnpGUCC^hmfQYzX$ucqc3<&5^LrDCD_jfyt&a{L#t%nb-Rr!IxoDYYh@pK;lt8 zl)Qi`eZGcltHqNXv5!cXS3&h6i_MBOmgsgunk2Sx)=YS=O*D?a5&PjHK{25GWQY1e38-3cX6`sAQ}QE6{UeoN zQEHAAbseg-PfwM6p?E3j2Vubt=&8E^1kExJ8RG+OiXGy;vPu-wm>fm(huZ!wM?D7p zxU$D1f^|)@qRKz`oK9!VXbQ_;ly>A}(EUig+MtzKp8nTYSm-1GgKo!=8Upu4LIn-D zpdW;G3%7Vvx9tSRi(ex!kp=2ZT-G1cmfiJ%p6QU9H2$RTqnq&Og6f~p7O($jK7`Pm zww^c78-M+MliU$*)VAe_LF|7yey0G;*dXXC~=b@zEQ`_yxI&FZ8e#9>{S z;a;eqe+E5TqWb;RxjT}|LM25V#;+s^`Ywd3th)S%zTcNT4lTq zw(_`*mBleHyB2uDlg1!eUa<_#z%wpFslj=6<^^#A$=evE1gyf$1|uNqR|eHfX4gOEStWoCNqziW_C#ayfJDSWY;Ev$KZ$*>YPqdE%>Nnay%i z_j!{5%JGLkv%Dnv#0g_yo^%-_8YDFQaAt0fEav*1a!IX+ly=K4kZ=G110X@3>_rhQ zuh~QdE6Jsy+IW~=DT$_coGy4~t+NsyCp3j5NTc}A#Rmmx6!ob<*<_1OwIi|VKAS|? zv>I2&K&y>yf-Hn9H)QsIRU6M^9*d~q!2_HOF(s0SKO-7UNnAGSn5JPKCg{h#p69e? 
z0tw)$$Z3+b&9dc%f;su{EBhqMj92U=$dREqh#Pi%#zf7Gqgk`x73*XUs&K??sRp2S z4eJ*MkdEVdkOmQa$nwSwlGGzaHev(WzcDf1QVX0W+;?;CRvJ2do$li;Lc9e->DR7Z zh@JWM>wwAz9EqzoE@Ljf=h%i{xBgyL&cky|s50XSSyq7?NN-<}MPET3Kd7QrL&8NR z7qFbioB*&tm0Ul)s)-$WUDDc#`1j;0!U|&HXBr!tT-Q2%2WoWQXTYCKXnj)dEnoiV zde+eVxcg6*_JddK{)5uP+I9J5W2bjd<-y_y4aHe@0%y$tEIUV9j23&XEKV4tjHot= zFSsfb3|cUw(n)(82_O8a?X$n`a<#cq!7t-A>fAz?6xBiHh<5LKu*B${{{p4!9aL-1 zT)jJBvYA2ooMV3VR*j6mhRxwIrHp{x>p=##stSg^2qX54g*-kOdI!!L4m%d-sA;rm zFtaRbdr#KdVGyUIcR*oyt{n4+oAW2p=FOGIkpiE9W-Q_hNeolPv-;G5B9JtauK^DR z6PA*oeoNT#;j>HFoORAw~g zH-CJ03a?ZGmWBEFX zrp)U4FwdIFO;G|w*CG8exZh|?-uRvrFBH{BZr@nEgW>1wcBat9EJD#E4YmAMl0h+ssPbL>Syjh?Qs zjyf8}&o~*a%Ks5Iz4ZBT-+y0Z5guUtKIc=tsgEzLk7MOKIw)Qd1!)A`AIJF>>bQ>U zbk%<`@K0PkrB9IVs5VQmYv*tB#DRPvpDpa}sthpwz2);n9h*_ZSce%))UDD;#+N?e zOHZBYVXe@fqWcI+o*eT$+sLa>kXz$aTfMVM#J)yfVQU!3!iZZRH!(_zqcE=1X$s$} z#UYR`4(``?JdHo<_uBQ-D@}3E_5*+M36_LQkpM{3G{4Z|Lk<>31phHyKFuVba0!94 z$L{b6qKwQ%+KHu3uOTnJML>ZrN{A}!NC$I85~P0!b^zTxm!^xH`ip71tpsNK{qYZ1 zU`{*Sr_Sa!1beVFyv{d}$*84hS@9QtDMjze)I)UjYr#XZ^$gz#RdsAKyq4SPAHhP7 z=a;6KK%`|gmLk9_P`>kj{wbhN8We!??>Yx*-|ePm!a8Lb$zZmy-C|DV{CN!E0>&+9)mQ!CRnUjs))ePN@;F&GL zkGV*LIn~#4bT2{)tO^JJsCtrSRbv-I+DX%7mEOb??#`u;-r;SnT?I4(X>wPvUS}aJ zs(3U=KW2b0oEia~dQbg?^*(<)HS5T|`}RM_iq&WhOQg$uny!EvksS){@ET7qzh&8!2ep`P?EuaqOmX4^l9?u?48S;U%STdTIK-abdS-4we#igg~<^`fZ5ahag z*2^2`gm@+|kZ41Rm@r0|7>|YAuCFBl+1oQ49|}~?_-6}K)DFi4DpV=5Z}M7GvJolH z2s6VANW@q#wD?RoGxXVK6zlt1BjY|Q zJ@6(O%VcpF)uxvc$3sH$J1HBs(S!^eCjw^z$@uI}!VKfNRh-tkw6kz&WoCSsx!B3A z|2Yp~0nS>OtPTiOs!~U?<@wrSi8;<5yfE2fF-5*c;KiPg`YDl9D9VDv^5)s~vac%? z-U6y8A`gugh{I=`YBvLD!&^TCIrhvRhVlXhf@U5{{O>{`b~`!J9H%vL+7$MZv;M1(;z0odd$ANmvOo0!wgMUxEDtQ3xDMS!*TT(^c^=`Clw-s{+>7JldIbqT zM@7+C=hUQ4`dIb8BduQqv}vp_FA{QXspG#v_R56{HbMaV96cxp5jemvKRyKjO;m{i zfGC@mnd>JfeG5SJ4@~vP^@}~17m`1V&=V|TK(yegoev-)0W501Nf4!7(-=`mO?@c! 
z!XFHyS`4moBPBB6PPfEA;VLR^cBNG$e6M()8hYL*RB58>iz*Q2+n^_M3K(W1 zR8MJES=kr@MS#r>hZsMv(}JOb%Kixryy3ht25qw{+4@Hi&x0g-o#o;H#30kc!iisO zoQR2PLp%}#YZ{ci>3R6#$K;19Tfo#T zV6lUr@c2E2M~G+??ms6v=#AXGqt;lh>hdxQUzX6LltP9Hfm;;Tg?7(T6cP0Ajx}p2 z;^fPpnr9$ZVBx0k;T%LRc>@0-IgFwTfxulhQwu_~$1xp>aYfG~Mu{JueKwqk`7~VG z4(!9_(IW1%c8&wAI{#1z21{fc+ekF|lB3G6zpf;#KtKs}ToScd?(c{8=}f1!0jNvY zZ52g%%k7J8hff@z!zPB~V88t{gHdb%009I+0fKw66hyTD!YRu!kxElCzwsEmHIy?} z+4fSef&(`4I4CgkDS?;@v)nW)QiuW&E!C$Ykk|gC@XpRs7009#Lp9^$v?p1o&Sb6|D$X7PnO3UcR zkbh!wgW+`DHy8Z%3vUrnvH`%@SVdEI(G}=x6@+5vpQ`o||8X(I5?rT$p*p6S-5L=U zRGh#&m84G&ZX6a0<~|N3g8dab!&3%>wNamqw<=4lSb2>KR`lT;QZ^= zPk%#o#6OlT@xbpS4OSub_NmzC=t5XjlUHT_bH8(tAua;QfOw{Ty4mH2`UwC40yF^v z3GlZ>L;v{OI`plvcXi!J=O&rU&2SM7UYb|53V0R&2Q{927U7(1ZDS{-!qE}SKRAH| z4A>|JhK&UcL~pfp#Hs4TR_GO7wA}f!q-p}TYG+q??4^>YNjR)+mq$(-oAFFcU^S$_ zP)$aAr)G@Jh+)0u>qa3z$<;N3t`(zbuapvM1$8n}_1^FUFl3ojcpBf&i0Ue3T4KFF z#mj(>MgNWvki`%LM3`%ClgXA@UJMgtQdX#xj=MT=yC>t*Nhv#PN@eeGa_=u2&jYo*Og{Gm<=9FNArt^S=Qe$pvO~iImjPV zRY5+bB(#XiY5}LVDi=LoiCg3+vLk(88sFiac{d|$h0~x)$3km71x*a(f;dMRCcJ&m z@}daLnB%u=$}u96UlO)o*fN=mU_HmH;_!d_|KV+hy!w%SO?2C8&}x!%HQ#)|uM~FB zw94+*_DGIrU|qe4D^#GiIXbn1TJtYgHS3zs+p?|GZVUKV88A^~kTZs&H)M?@BzVJ} zYn_SMcT8gxzV~*1HyK0!+Yu#u`_2tqg9CpNzeH9?>e@2cdCPC=JAnWI05Smq0(>pe z5dZgXyA#{4s!>e&n9-m9Ei{%y-xPd~{7e#bk#)*=@jo!2iaPi+&m1}-0000|0Re)1 zEzvXo^t1H?wk-2^04QRUluH&qizTnF#h9;XB2R}vpum<;!*zvIH7w%m!z6`JwF1-E zSoCfhmbMA5z>jS4{u_0-FJ!4Qg-9?pR0yX9wlq#e-^#bHw{L zcJNTp0gN~fH;Dr_P1gJ8pmHGM*-B6Mb+r}JiE!SGL<06|11j|#d2HJcDm|4DnmyEm zH)Ng%>0D-{*rK@xj*q(OWCq~ZTR?`8@rnG+Rue@)$ats0Z_8F#j$dPSw>#v0tt~)t zb96)Av>(rAuA23saL<850Rm2WoX?X;g{%n6i-xlb@RaTwdM*yD!LmqAA(&6i7 z;pm53c`#0yR=2=&-Ta>=^mv-Ji)?QT%7RdPU)8D?+XD|m6ep(-yO4c&Nz`yLwN*5fd5$kB5fXl*M)xw#W-~NE6+&iZn0|$&&*)9$YG`GZ2yQ z(wDH-*A>$(4Q!f}MFwG!aIZX^K-b=fOZM_QMsk>+Viyd>67J@0i}D+{%n{K5jgbBt zR{8HgO31wc#$O0|3oR+rl$R+7JoVV~Axddv-tTsi>%KBqIq+JsqJF1aq%Y?*>*Z7P zM+-|4c$6TbFz`*m?q{C44Zm#+kpD{T@~`f6e*uin5<0!iTrFOIVolOR;b&B8(PZ#h 
zi#JKEB)r}e-~idDRh+KV)EKke$3sYGqI?tD#iDSlxMu^5HSP7yoxj?<(+dwJh6QX_ z0Y0nxA9H+3%#qd}r2TW*8u9@~bqbc|Ga#EQC~Xg}rM2be zrMXRe#C$U+c!%(&%tSr{24!9A#-OOk){kB^wg7C#FN2?NeQu#i>0I9}>ngj?$*oyb zIRwct!?HiRmKrOXjm7d7jzMX1X7ivdNonkDy9jC==VyfLP=nbJU*4r809Fnz;&|g#Sc58y<~$KV`tiYRvrmCWXXI<|P_e|5<81C0&YS)_7HFT_RWL(|iwToy@Tm(*@^XlG zeAsLU89uUfQr_-${XHI-1d#;>m$&4N=kYP-(ICizP`#gxL79#sql#$XL71IeycHW= znr2q%9F)h+eT3&G+JI%|kDw{*md-^Nq|qO1zBnr*3yR1j=s{PraGZ?0eMHDhb(lZ* zeKXePD;#g2n@}b5C-0b{2w^DkZwy>$HiAk!=G#{9`5-887AwN?s!ZgG`ahEJ35QJH zy+oP+uB6CucHq3&%CEKV0p{t}91o-PmoHmJ(PK#NDVJqH1KP3K%kRI~aKt5_977Qn zLqQSgxeKqw1zx620nbS!+B|=_mZ&C)c=^sysl-6}!(`~&=iGBIByRzlYQ#I{G6O0w5R@2ukG(6uISenxc(#(Q%c z#y0~KAzp6Dbk4uA^lp0mUN8vdAiPyrFLaiPMr;H+-ftZW)xlqYfA=%L;pL3BFdM%> zcg`yT!SDK^Yr_gjw2RV2K~^%fhs!r)ePUw@Fok9N1CAFD!HES?rdA52g5qB^{dH3F z@I0qQ$2!PxawbbT^(=O2H>SWrq|ui6*2hCGXvmDJ2)8V8(hrahj4RIZjM`q8+lg90 zn#bwykR-*>8EW3l!)hc#O0P3d--QH)Ek=5~{+0zE{>EHD=n7^b{rY*<88Q%QKOw!* zN4SS`1P+pyMm$P`FP(-xh?7Is7+MSPF}zonP8?OO3K@yvkzsjKIC$!8>{R1bb|3pn z?tm#y`yP)7KQe>@~I5e}&2DPn?yCqY-3EfJj>EU3oshfhS5 zco9E~4T`y@bgNmBi8O}Wd5rMju8j~=LmX>e3_`P2G>WUX@n!`Tj}wVg#6i`pAT)%K z=lH+d;-5U}0#0O_`$GGElPHw_tSt2-;!}n1{j=(rp{q08#cnY*9Me!e>h12vg8g9d zjUSnBJt0tEj0A6`q6?OE{QKIQ;?tl=rq#7+-OEpauJ}37CC5Gn-{m#!*)s>Z3++vq z_&b8yMSt-oexY@5M#&nF4`%DqHyHinZ+ouDAul z5dVNz(uP5ls&R1t^g&IsQD`%9itK9`!8a<*75?0RG8EfIdOFbkYivvKYn z)0}1BAC9T>LueQk zdsY^~$g5o&{IgBUxrb{;W~40ExV9}TZ9 z9E0p?*evkQXzZV%Fmyy>Gxt?-CIK`1SU@JvY;CKmk2zU@$AD_jdX5o{a3IfL~`8L9CPWd+z#9d2N^AQ7cLJW*?5Z9ku&sOF0J zH#MplaFDbvSa}+%=0sRGi4O9q6pz3ta}|cM)*)K)4dUDViZOBcYN*lFOY-y=Q<+`5 ziERUhq$e`E$|7blB}6PNbQp z;T0mg>#F-|4g**)vY55#9;GW7SUIdTRBEE>x9SjEn~K@J!h`nWec~PK3HtIc9mB1{ ziv9@qZNE>&DATac%p|f9A9agJKy;k!b5_bb4f#_i$*>{x=1@yi|O|2XucDbI}(O!Y{WC$DE+EYNGIvOcHpBx zS=P~RyxuoSBX^RvYN-_K`(^FsYxoj$xb69@Nr}W61Q4AEu?QY@;=)Hrpoj{bfXfg0 zCor*>z(szNA-XHVKsc77WPopj!XrjG{6}eVa}+}XJ14!3K-yI$@xlaY((U0}DTInV z1eLCIamfSf*i|_bt?;R~nfJ+%S^GS?nXig-lBIwV=0c{%fmWx^OY_KA^NDAjmR 
zE2s10^&c#>(*EYku~=EA^R7R{4J)4_sS&lcgHl-kNCKpSuG72*SckRB6 z#U}Z@*(<*JNL2BG$I~(2lzC6uNB>3}vSgP&UxDVs2~FDtBzrA4>*AJ%k;J@akrynt zl~nHreM`xq4}UO?=eLQ9vv_c!I`H}}O_W~z9%vlze_*g$yl)*W%P{SZHkUfi5wwg~ zpijQv1+mU@+h-BDf{5M0V020e>wxA-D%2=tq9nq2W9lR9jZyx4~u_M>o+_Dn&V{`_agS|`qNKL{lFuAE8yQgVLr|S|G zH=8TkP#fP8bBP0M(X8lesDSnP(MN7?BYf}9s~mim&DP5w(ycqyu-=fp@4!|I2prqx z@Wqowr*}liN8w~&t@dQL}WHhLrji^~aQ zQRKr`iOpCI0`6?2qk7a*hNIj@57v~y2>!6&R*=|*-|uQH?PUV~b@6C`p?ZVUhex(+ zk9VU*UAtmndpnn}SZ!5n{@7P*r@jEYq3->~tCU(wvT8q=d_wKh`#P0;f)7$BI4L`j zP)f;YRRJ0n+U&F+WjC8rw%#jaLnn`oksZ&}e3(<5I?PRX_)x#}M_8)6x8{8V$?RKf zrfO>q-tXQbrdb{ZT<50Mv>8k$cA_2_j>ZEuay)#k&tW`NhpKhU5I2P)iwCLJMt;Pb z%D_akkvRyeC86>?8pA*aB!40QMX}+a6aII?1jCT1VnVCO{Zhrw%mJN{U#hRqWk?5S z(C}=uZ?ujnw;A+x$KfBQDBDk6At~JyX}XygsY&C5a8|mPTwQ4RnCu7w4};W>u8V(W z8yMjxcV9Tc<3$7V95_@XcK)&vzgH1VU`H4VWYRj@GN$E%Nx^p5plp|51uH4%;OitujCGCWsVhYIee`dI0q>)bQ)0{ zvYSck2ZgJpE5nJ>2H(LaCL}*ZoO23N8{%D&+Rw$P5<2h@jOf@U{%IX_WRpZ7p$kv4NR=Hph7jH?TMK58sT6KkbItr|P zpi$#;<4Zn~1pH?X6{A~y$uLtIzd;!jr#cpe2rFeQ>soeL(i132j7986hU7S^l_DlQ zW|c#C7GhV&vtZ($v_`cYxN7CB_c0(SX# z1!Iiukk7@!XT)~NZkCwQ1^r1@pRys`+@av6j9iy}m4+tAfAGGnA_meNv0UOy4p90T zlA9NjU9kvAeRGi@2J94Bzm3KI^q6Pu(phT4(s!_n_G)dcBk|b+jr}o5-NtimHPtLc z(<# z>_+sZF_%gmPA&X$X-qEy)+Ba5mltwgdxYqLQ^eqJU^kF-V^S11Xb_aL2MN^r8ANZ@ zX?s7>xByc4Ab>u4s(E2r-Xdfp&qK(xa^aapTDf@AE0BBkpM?w6FV6yj`&=jk}ZP_ zd_fGGtB`zgXps{yf;R=9lmNiVK0F@t)l@>naC&|Q+Kpe;QfaTb~aJgO*F}6lm z++}IFG@!}W{l;kAT^E3PX#Yq&Q75h* zSV};G%3plD8U%4fQN|E7#^MD*sUt-Cj-=ZoS5A;<&9Jw)Kp5wV6;64ALKu5iAR(5E z`nSDmsp|t>sqF~5v?+vED8h&@>gyO1U8wlAM>?4K#%z$bx&70$qUkB6FkZJ4`zjFL zR>dWPl<$i?9QsEI0#tvl@IX{NadmU6IaRC<*VR-8qSyPX`g*oVP2>@jqe4!4pa-1Z zIhg0AEK+P3>!vpcByi5B*DFIn%v;HQvsuAU6KAmQsevah&3>~l$kv_I!G2vh|8EIw zvmo4~G%Y^L;N*}-N?+H7_Bvz%k#!4gf&$>srs@2PmgTkMY(vUM{l;oz9Lr=3r8W{HxA>+s`YoHgfJgqq(ZkVC) zG^o0g@@Dt;reuBqrq#?3&b*w$iLU#>Yq~;3DeIKNFo&mm@k$0ZG=OG;>)N#_3 z9N#at*ObauqjS*Xydoe$dgS!a;RrEDu^m`6p3=XqCCjpJpW$A3`T~lc`8o$J%CE4i z&U@44NUS-tRICKk@D_>5CiohC0u(gbA{Y92vs*JyobPUm&o35Rs22KqT3UyYO)xA( 
zy)88XUP$N@>Z@x3-6TNyp1&>&hrK`sml)-2z|dbaRpn0K2R~86q+_({aG}x}z61J( z;O*(isjgz#wyz*t;!XYN>&Z)v+7+Qr2XahP7nFe^L962v{rCc(Q761Eq|56Vz)NcN z>|cgzxvO(lBsCNU_yU<>vuD03f4NVf0t`=&39LcFr^{8pI|dGmSXg*L(2|?;-oFPx zSm6BI`eVx+(wds37Xtqna?{4IuXc-32D-#!e{I1Ki!lmE*SN(5{o zQymX6VpH5>a>%*wNg~>TM$U>ilMc+7R+7j3tmDW)JSg}?=w2-oWK;ldLs6Y1lw+x~ z!3yx-Kb+bZ`$iqQo1@ueGGY`SE8$?|cgXKZZSSW-hp&u=6GQ+KaA7&zm6{uyd@R5M2 zV_5gGgHc1MIby7kS=;Lb6hot_!C$4+H1}9LO+1Q;OUC%ou{w&VXeGGnJzS zdycT37`ku}^h9>r2P3`}$0FHj4YF|bb;oR$t_onZ8Xkg^1^&-a!8Hib)p95o>H?Zs z=X$9RR1YSG%SniGSTG`~_r`E7g{w90aYH;_^fKT7nj3h-sb{iqz^1s!n_8eX*}%WvNA0!!uNj=+!N_Hmu(9jav_`wWid z*k6>RB(XFJjB0!(?TZVyTDNw^nNtd8Z}`+eU~FgXQ)S>xvjt4dmm9pJD^1j1xxJh& zRodaO2Nct?KVo9b3G93f7`b2!uuPi08SAeMM$2K>KwNIYZ)^?EFW+J<`Rg(^nlO<1 z`OR+P>`IfA9Qg*XZJk^&6k8zT5XWgj4AL32mo2qef)-d2pvI|(;o|9N-DDIPhs;rA z-QC4gc$;VE$A7&Xz|p3ARzyR;oW>3Pu0ZF%FeXb(F3`;TdVlSb@Nyxv65?Dmf8BsBk>AXCcyLrI=1?HP zX8}b_JdyEh{*tlo9>g?ybHZah&Q_99Y9pb^%}ZiI0KY^gBdrMcdDTmcSe-V>7cp2d zc0QU<0J}F3_fkrPEvqqEH5swdIh@B;<0|Us8e81(dZ+4ZkF7t&z&hklF9Hvv*=7(` z9)0x7Y^;DT^}tZ)T2H%R0di}7du%a&aC{<|)-# zG~m<|Ql^4SFr6k3=c~apJ7a`DFdopCQmx9{jA2Rlg+Lg|6RT&vUki8jRxCRK$W(RL zXI(!f)j`*K&Yl0=jj+ZhOU+|JCEOECT1Juhi-K2T9huvW@759}7N4BH`WX|5Xg9+S zzO(bn;;3+>l)B(d7W(~jKW}Cd995MxlMc^ZoeqB$$I(T3#R7$o)caCy;%e(?Db{(vDhV@#g+T-??nX!4$W?wdmd!$oywsy&Aj z|K`Zd)Q$V~sh$7T*@KwK2SU*Jj*Y5NhhZ$q+;~2-HTH7B>8D+}t)hfmO@C@L8c4Cu zk6^PiqF0xfg?vedU$k;NlL$_G)`IXSWw*b^iOH{i3wP^5f;7|9muJ?s%<(KP@m`bG3jczzAzSnGX)XKLMIa(&T{P7Bhnas045y^>8V zhV669!|~K?z?SJFt0qlHl0tt9d$s`Q-Raja89pnsRaSWi6l9rE{UY7L_))AJffA8V zAo7$JNc9`K;Fqmg?POwiHJogM)UZ9)yfB)^rq^_(%63aDc(;m2#jPGJZXbhuYw=mfWnzW; z(*yu!T>Vv5g|9^;*OlX{v@9hjdrg;SFFvrV;7q{vr1;oN^~J5<4hA8^PnDjXfMAj~eujuP+{H3AYV3NQ0 zEA)SF;!0ShwJhqPbu@gFqKUh)m@HEgSnUG|&Da1qNe;=m%BKESoXyi==mvBn-x7rY zxLGllc}U@vmG(;0H1tUt8c3_tN_)tQI=K8U&Nhh%4S&PNpm_WVc{|x_Q%Au1hoMWS(yq|urxV+PgbJ*Jq%Kr;{2UoqAP#3zn^dxS0J89^!b z?4ri}UjyTEaQ^^lqn;{N6#wxyLz%qJ!hT91qanLD5;Y81N_+H{<6iBp3Gp>*=v}8C 
zjpE|8qHPEa#l}T`(Ss1XucA$LLA0t*P$w^l5Q)r^5GO4xI_u0sw2d^ThkhRSM6=P{ zedY8KZk(TER-O;TwmW6hTur)W-D`KPMFGTU?QY;RL60RN(v5K&!9DF~;y9b!kxWDf zQ_f*Rck^Y1!oYjd)6IM?wP%wsGI(6!NWHTaim`p&Ds?-WJGAj1l?g0#{vI)Fo91{P z&Le%&R$9N*9hlt+Yz&MsOsu=Vy{l&)P16Ug^bAGMsKhtJKpk9|a$7BCSaKRmh6j zAa>rD85?Bc3l9GTUpaIry!K0N%Y}bxn7bqb0Cn1y+HjwDa{9F8k8jme`W0hOT&Z65 z3!~3HPTWcTtO!YlPEH0KESwE1lauHg)CPyy(K>;2I6X2jEyHe)@lZVUmfN-751xx4 zICf|*XUyzCHTYY3Q-wj`1r4OYgRdR=Q{54T!!dt=2%C)O6&`8O8PHp^qmUSy(G+5M zf$*k>yaB$Jk7y0Yrk&q}`oMZlBk->~~#Oj<&+UKtS|%e#T4pqWKphmp`})N1I})#%zN|InLo#&k0D^!Ptk?1`td(u8^2~t5=o?kj)bvFF0c~Q&sk48WZO|HE!Jy=zIiYboT z=*{m}^lC;JCY$X>rsm+>wa~nKk66B_b1Rw9j95DP^r%DqhpucbvcPJDNVEKPDETc8 zR2A8j%}K0c`yFa?Z7g!eI|3y+m>9?1yT&P7ZQ3d#L}JL+)EIUN4=N zpynNJiWnC+QO@B`I!?1}=^iort{}`!72Vw^kqd&k%|Tt zOFuB7d?SbHF$wZr`xd|JS|5!_BCj%=!ux?u2}XhOlL6wMx>d5&Ub@{jC$a_r_M#18 zFmwDC%F=8P(RLTCuBRTZ)Hq#{oU2Z!Z&oz>*GPyPL3x*`r&ev!cEqT#;SfMHVqmxl5cC_uLPZEqyvtiMn2e7h;G9!^Dc&JzkLzmbU%K=#9KwR% z`mv4p3mCQw<}rw(($sww-@6}97V52+BD?eJN5~}c98iyWV=F3&2bho-BT+-JN70v1 z9}?DKvs#f)h58Nt(ZH7XkG2}6e$|J7ah*ru@(JonA*<|{7aELwQg~Ptnm|t)ww>53 zk>J1mh<1Oj4dXlv{wpXsYzR&Osah-NhRO8kkyElZvMquc zdYqxW=G4?y*aM}AbO-{382d2 zR1?gKWNu?q&Nb$T3wL!GBgAJt%bs@P9#52sYRAF&11`UxW7W7Lq}1Mmm1)ah*l>Ebtelp2|w$vTrq|RGy@yXWX61zQjAMX z(H$MCpgbEcFmv28TqwG}Jtz2QdgqeI(q9r!$)Y&KwPjTk$<3u}_B4zqPkyBfC_=1q zUZ)bev3a(Jx1zlZC8$O=pE=QlvOjNUEuT#zH`7n5SLh?C{#rzh8tT1OM;Wrimw%=3zztolQ?=jf}h@#Ts&nq}1VgVADtg3Et z17vf>yvb|T3~*VuIJK&xW|jHz3qUe%A%!02k*85f7Zi$^iZ9ecilJcW3oWaty!Aqr z!anh+lxg_bAKeYa^T@z}R7Z>I3jJRA%Pql-P*!ZW53gJp+Y z3q4lSy><$}`geuY+1cYaNtk9%)yQvOktceAB@J%L6+I=U`YI~o%k_@AGb%$#pyrWY z(trz2`^Tnevq`an`>WpsupCcM6OhJ+SSIFykcv)`Q)@HKNhui9onY&EB(d7NgP?Sz)dbTsc&cq_iPSVY~N^W zD5(;xs@VuYG4qO$geQ5gjPo(Y4W%?$(_s$;l4}zhY-i~{i>Dm3RMd^*X z;z?k0b{ZFTdKD7)ub9)+AsZeL({qQRl$Sga+9bW@DNF32XmC1<-B?`ASJCcV6>ku^ zH(#u|hk_a<=HRiHpsHlfHn3B6nlm~Uke+~iHGe^$7*AIsWNthH!Ipy8iF%J)Z$!P9 zIT&#`T2Ijfl}l0mKxcZ02l%FX*$mmen!wh-xU`ABCn{>jO~85UGz}E8t}VX64Gg|1 
z!yD7|9DyoYiT0C5*T3an2{2FI+3rSyt=PaqZroA}y4`3|!tF5`0(cJ0GY%LFkI7_E z4vC>YG)LAJAc96KS2g!3yh+`Ck{Eq;P${azMi!wV2imew51%UVW0yvBxvZsZge?Hm zr?pmy=Ud?_QIwk1M1qc{Z{%H=B`l|uCeM)070@@)GSE&_D{WBTU6feAu8~>|pM=eV zspt;I-0I&#RmAlUhQyx}Op}FcZ1)&_6fbtSTJUSnm`<=xx#5&gdP)Duw$Qo^JYFQC z4{&^;fUH9Opv5(iRxCOJ7*6s~ILP~dN<8}kf?&E0L-TK!oNPtxpGM+0t}-emQG=#qdp*^8fw=S8>{kss^wQUl37v);5I!M< zvZJ$v*~Caoj8O=YY$?;xbAtD$S#TC=mb^Y62eczkZbMjj>$&&w2;-2m6_{N zs>shZIHW$avz-00&=Pz&nHOFuoN)7^wzeeJjsAFjyA@FZ&s=mH9`{x_Ec7=|GoJMixJqV3 z-m%U@yS$*l=~79QA0Vj;U50&mCX3#>90W7=sSfq+fKt&3tz(&(Ue5Hn4I*>eNMH+M z61&oSDR71ez1(USq73z_3(H*}QkkVpV5$9a@y#0#;`<>{WB`m_@8r4NO~9p7w>{qs zsII2H-x@Fb?Fc!0&2oJsj^xCwG{xmg{nE!5i`)G^qsg@w)L0b1|0?xg4E*us*R=Vl zPZoD+zs1Fq94pCQB=e7-t0hoaOiVfCiasBy>Mx*wgu*?wViH+GgK+sf%8$J&XVa2b zC2O1_JCQYhZGCcn>MQ%A_OX<6b@%sAvo{(1@s93=wsYX@Z0pB+zcG}9JD%1oT-KT{ zfWuG&zNsu_*chObHp7Z`GiW0rJ{!0Qgh+b)cz9!Uj>Ae!? z%Va^!&v?D6b8|h69zw9b0=BwHn$=GjK@5y}mPEGWy}DP4_%p6<9C?ZDqBEhrLox1t z*?v%`b1!;Rkd*XxDsr{qQ>RX15u6j1U`>LnGXUVkmq{yPhtfn}czm7XHi?_89qx5Z*RqG@&}e}B(PB?a^ny%s)-w;caT)m$*T*;1< zXKV$c^KVpt4z(Jk0dL70RiHS7=V!fZo>O(L1D4Z>4*Il*L(MZVMG0Z`4ad7}edJ#V z40Q%$0fG3yREykTM#ohL0$8}pz}{wza0 zN3hIof0%ZitqXS3KkJ7mCGW%%e3GD8J?#Ou2A={2e(yB-mwW!bQ|6yFg28Y>SLNc# z7|EZk*c#eulYm9Z5(>o$vZsYMtc&A*2THVDFkiz*h}VCKO_<3qp+m@Q06{ zq@A?d4`NNK+{PifZ4!5Aj-{ZN7 zB2cqN9Ab!!4`xja8O*0;i%5iJ(04NZSbk-k+%H@V`$vfv`6~FqjML>@*@}rrv%_b< z!zYpITg+0IR!Raj8=4cTXdU`0%Y6NLEHcE2{z?0jKya2`|CH;_YSiuO$!Bc`(@LMW zaiM-{zDD2suDyXXhj$TPz`d6z&^mjfJma=hGUD<%;t+M*S;c3t!K6xCgS?Rl5XiJt z>hRyyUh~Fr#Pt^8*PV)9t}F_SEazO6Z`K;a%Fxn+I5zDtJL(gR8Cvi-W{bigjS^)0 zv63}q#kZpVMcrn#af@$b&ke!Ci9x>Yamr(a(mX|vBaVNa;|bp`N+sH-HZ*tjxFIZy6l>%UG((qA*;9tS=nicoV5@)BTkSWb90 zFIK}{0Fd76tQqelgN??rNI)t)K2`@QEX54SgUY zaMTGhjSaTxpp(D1K9(-ZM*Q9Y`+pyu=&#R^M;D|Nrm+IR_Yo5~W)wQZSJRfv`(<7Efx2S|^0Xdd_I3--M(kzTL6g1L8kb+x)c@LLu$<|f1h#UAsw0o6?2xMW z)To!1_7D8O_g$lfkBN`VYb$Q=0*xuYhQWc23KO306x`kT>Z$G~t<*jte|I?SZ{nw@gS*Fn?C@xO%>~<@mJBPo)*L zOdfKwdE|%pW$|onX|iZN>v@HXGPH3OMM;2cOp1u`jp#FKo!LbON+4hO)%e2{B(XO3 
zD$}I67IMp=oD-Kb&i_oGE{zeZ zf&o3HUWma#V;n{1YHRXcwSWwm-5H4!zcD0exwlU z%mZvrj&Bwp-YN_wA3BCB#ar}F65{F0s=_2K`p{qFE1#AUuhpMoNhvPxC^{DGxv6Re zkw2?$bP3WVCzKDNit=l(&X1kc#YZ`hN`#m6h-O);N8!16w72ZjFe5ouY-w{OEX z6Uev!zWxa)dO&OCu@N~tu{+B+_;OKq;;w9r^P~xLTr3r_JM4LINT&L17myEAIOdQ9 zz1A*pehj^MUEAT$*P86JbeqrC*k_)yq`U77uI~5Z!PF#Sci%GVi4Pax$KkD_*QRJ! zYn@WI!$K?LkQ=9Ua#ST$sTinF73PO?Ig~6{!x$*i4e$mOg7iP z(u9A*2g=2ny1pW0Wa=uv4L6ee{wji*j)JSG$#pM1pC!@Y1)GJy5n$l9t)^hMs}NP3 z3PApK{s2}US>^o3+FUBJXA%|Zh{<|TehHnJT38>{eI%0RBN(&>m01;Wft-uH)p7soOaj<0XzIfnovcl>&PdQsvKT8BmGQCoD=*=MU{jz`|W4^ZZhjPDVN%q>+3*oe%|Jv-~4z zmVvsqfFs}n4-{RVk0!C|pZzpR^0XgSGum(O=tP_Y0L}rvs~E!Bqa?AudH?;%Qw&ashJ~em!Gn=FW98pYA*B zU{k2iAeH(6u;Hjxq%&SAjxKd*esw5_&PnGm{c#ThVWuJgJjE969&?roml;g>sv9vP zm)V5SZ+*@sA!L&RJiq8o!zxxIx0~Pi2w;WbF7EW-gm2P#(fPbf#QsZug1>H&JhaVH z0)PA1^sFj8|Fl{RqgB}tA4qf&?Jk6}LNR2i$>PpE-p;h`>4^16FukP?xixC15*dH0PP6=>pg~j&Fzih{b*O z#De$SQQAU5sMePv$>;$bidOP6%-3DULDVf2MG1f3ND6C0K%sbMaVQj|Sam?3^&P`~ z63Lg@^&8JfkBCJvI616D5m~~&6>)Nuf3n+kLCk7cZ+z}%^{VmSBU0(4P1=WVp$4@B z@n=<6Se6XRhtDcgk=fd8K@T8g5pH3#7ihPblynfyd4p7U+He5&r+;NQ!L>DUd6)m9 z3ef}87RZ|c=~^NnZ%^LTv9v+mw3TZ5KL4CCK_Iw~Pc8!qNfN@MM!0p?|4L#ErySV; zU4y|=)A z+_^FZMaEvGjCiu%S=He&rJf=8uoqS?Q4lNpoH5sB!_{c;7nqq8sJQb{2<~-{@eSp$ zuH3G!h7SK>B}58oxp6aREx3KDD{Rk+`ebs8%_Vcxo>&mKJiviX~-{J|av$u-@V!$~MD*f`K_ zpN`%Wxq9ahjn}F)Gekka1;myY&7^!(( zfD2h0>~Qgqx^r5uCwiZo8<+%)sJw%mtg-9;pm~5@R*9|QWx0MDB=f-dL58b(;kpJi zJJIZ=WZ!3mf&Me-{4BG}bFtsWtjYi}2#`-6LdN|{#D--YsNf0fkh=o!c9oSVQh{}5 zK>K!IF3NKS_H zJ_&FlzwNetS7Y-^(n?1TOJeO4Esyxfvis&*39*bC8uD*rEtmg?sFm7*LZll;J z1eWxR7Sf3RuhpR;M>C!i*Me>Cb-O5p%x$-5Mm(->L0kB%X zrS&SU^R`Pr{ZlUD$gnr2X2|?}ZQ@7CX{-BZJ_LIh@h|{5@{v2|ItaR5-&fb!H&C{U zQg;-3-p>#SJ0!7*%$LfP-Av%$-)>aSZyVP}<9~1*?a%r)u#u&>00Q-2S;BH8V0Rt* z`3BNQnS`2Dgw>tBzr|Iy;mr$x$UcWYm1$tzl$Qr*eJ?0VTtc{?|EIREj;dpM8oam$ zcXtVH;X-hCcXxLS65JCYxVtC8JwTA)L4rHMJ-A!==H|V(`@Y||=j=bbbxz;v?y9M_ z>bX5tGu=7N!FX!to#-1-){griTm$;e)ZC8f)=$gyb3{UNJ0-D`c|I1Gqh*4fTyB{Z 
zyGskFw(<9Ap*vEcluX@F;5gX$JF->Ms@xr@P{OUZ3aJQg8VGx@R?N|4-;ka8|NPFR zV575<`6SKY8K7-zf_4rVa;K&eCb=tw-+%wx?p5Z>e}>N(&!( zJ|n(@b?REd;iQsRfK!S{`ms;Rn%@GgC6&Agt48liD@`o6eTQ-4U;2 z3CA4Ht=yedz_g|H`)BBjk&wnSy9L5nbQ|+b)ta29Io+{|m>mJ>U5^&0f?O`yQ!ll|O-j!n-jIf{73(hs z(B=z2H7X_Hg>Yn^9=-lL^Am=&k(OVFQX;{*e4Hevc(CIqR1C9#Fp&ns*poO4n6D)S zW=y?`4W8eIM4H5*mBmsOeN*FpRX8^i?RyLh%Ry5hqhWHmIoEp!;wmK?jd8$hG07eC z@r^wkwx!RueZXCN!;4+5dePuZ$V-r;dWV3PDv93km{3Snl8|^2z-IC(Y53e-oNl2~ zfQ&>mS6stYbahRvIcUr_Q^w*p?kdPJDHJOZl#>EQ%QIV>(C zFG0kh<41udu6V!Ugi;(LP4$s;ok=`fQE&eBh=-Q#;~%6;fPw~!=6OJTuFFTRZ|}T6 zC(hT*u>2?-Mv{~gcFbQi=MLKVRHUU}lB{LO)G`WP5G}Vnw*P?_{mKUI4%Y6RF2$Hh zF#guZW44CM-N08!3~kC_FTe`+(gHaTvrkm;MKnj&R7U--#j`(lyC$kF_eVQ9-wyiw zNaDf1QfG$JH+fTjF5+bUGf_in3|*zqlp(|61M2pgema)(n5Gm2MO}$I#j_o2@8PF*T{jtI;eq=a)_;h82_djIkT9Mu3qQQaDyuS%zU z$P%<*j$Gh~Hn+@~%GN?3X3OrI$)%~=0?h|)Mb{MdkFJ|x`wX|2)dj@hY4>@UF9=~6 z4faZ~h9rFJS&7drpAHwKt`I*J(VGA45* zI#=4f-=rpcV!|J;?#&Z*OE%RVi-uaI1%>dg0C~K==j%MtR`JGfst)JN6!FztMyNt& zsQbz;11~JYgir22KCb2D6s$6deK^l&KR5_p5;^@EqW(NymxwgLwM=)fF!`K^b*xrz z5>9Kr93D&u&3y)aI90C7&C`U#Mil?Rx~98XV%Pz8k^ar?(lB=4K8I<#?MdXk6;6tM zi1ktounE2b*7!FPs%nGJXZoaPeVaR$HPKa*P%$=sn;Z!viffjJ5WaW|4&Im9y z5l_330rRZ_!t-2tb1+K>lAyR>&b*_>qc2I3dpzH7bFx6Gm+Gm**P6~uCR4I4^U9P< ziEEA<3KUV3Ik_|5BcLiRdCauEu$#FpI+Uov@j*9zRapX<(Gx2yO@7j6*9ixgUV;cf zIWn0AmiH8{y{8X&3_~s?1IULr$BJv7uf)`eIOptXzT*W`P3YbS2>%)}sd#=Nv8~LL z4h99PUHhTAqM#^Q8kbN4PO5u+P$rtD?3)W;mp6ctw-G(M zASzYKYzM69%mA0C{{+P4S^C?crSBn6{19!*IP8Uy#K2J5qUus6-uJ7qsA<7z^mGH^ zF>P-@!2<@qiF{ z341^({=k12q?Cr|pY46&lU*`n+0|I)@NruE+jW^A2n5%WRmckZrsSBjG!_hg$E5bl zQ__(Xpg3#3-0X=ccal<4M{h9V{7DmLJdsc1%e`t6fRH=yqnG0o`tIgY!Lj4S*enyo zT;x?IQQC~n*;M4zAE&r$lgq<%=GFu@0TkpVR9=#Y{t*+?45igYTfEvGhwR8Cbr#ZQ`t#Yj*20yC%&#epN$!ui@9gJ3t!qUt2lc@ zwFS9;@j9CReWTP*rjJILGWf;*nx`y~`an*ab9|NWeReD?mpUGMo41XWd_*eeW~^4| zR|BmmZvO|}`bd`awjT`zh%Tt{qsUv`Bfn1Yc9TU@Hs5aWj`TV`5{plK>Q)Kn<-gv88=2c6pDl~+u7HxB7F%@Lvpl0VG zo}Lt`+FTz+UG3W4_q#|rQ5y?<95~SF9C~r~q5ch@+LCH`IoOG-UwmM1DM3VeJfcud0VQaF;T41h*lGa^*Yo_j 
zweiW9bIB3l;{vD~sJs+A{pUL-to?isZ%yAMMxUD9N$mTw1D3kI8#J|rG()dTj0TLU z2Fguyj=T91yS|!CIgsJ6iHan^zWLXuSPRCwpvGOJj7?IfVh<=%mLuvB^ zVQ8g;4F~hLt>&*BD4E@4E?g0|hx3A+_I9o6W4cEk=sD!nRCb4-kF%OilkW~=zudIt z9y7cz%=~yA9f zd+om0%~*CZ3cezEs9kp(M)M zSAhNcfH?N8%l(-mw!;`Ic^CXdH)$ zBdn;%pSLZIBEVe5fy`h{z}qsOyhF+!pzKbh)7ZY&OM5uW{9>oayTin9UPIxB%@=kv z;qDP}d`cHi%{x0Wg6hIFONB9dP1F*4*;6BfknsfB>yHJ5h#0YKg3nF@rb4jw&Lx!L z->PQ>1af>oqedP(j}$QAjIo=Xj2=e>ysS>{d-r@zu9A-%dm=?=smF`;tBvJRMm(3W z2>)#QNiov$triVJbG>xW+O$Jt{s-&E4 zPho7>HBVo>KiUfMQCgD~ZHC$m&b8#@(BDT{r@EUfe(Rs_T7#cm2+%>M2iB^AqFEV#xs*@>=P@i3QB5P*CDru~ zy7SmdFi_E$xeQJGb%YIrN#^->;eWe`qf4&);kq(T`{2J|A%k)l;g&UZYoaHSMGLyX z<{*m*7Yqz~5H5|G^)yU(&&QuU!l60CVh+$?>gD)!!Nv{+55trB*=3(cbcf>!FN0fN z()~%H=|O)jk(u(x6uWI1GTGwEctIL_B=no3TNz{zj#^ZuXD}F6ZhJNRsn2m-6q#@! zeNovnMl`;)v!3(3q+`M~`lp}BSUW~W!1C7n6Kg?qCt!CfUg9yu_s||`dW4naB)YQ) zbr+Vq##%&&O=CUg-4)Lht9T*3%WxoxJxz^HB4hr;qdV3NrM6|rfUR6b{ z&^=LE0FUdWCA5GGR8RjC4k@wdkR52&Rw62hXDEr?XM;98C1%E9j}K-CEM3K#$?7nG(hF8@g@Ycl@jz zgan;chwfW>X%#&5tImWmEb06j75QSFAID^0yY`Qis_v1e*DvSvB*qm7sm0i*4QbyS z+9xnz)+_v~{?hwVgH4`mcazU{vYWpiBqd^MdNz*?qesL}y-EK0)q$=-Sl=((PcXac za(cA#as%Z@R?pEa8%aq`#)LuIDjY3QVq=kxI?m^VMuPlkh2^ka#sdTl&g1wT0romck;Vhm9>cCx5~X3lWD=jUgNsL`$&#KbTVH-C?Xtzxd>y2V z7Z2ze&l_jnC2d)H^BQ%pNVNI2kDr|@d?ozClABMq*Ku*V@!r{i4c}^j{FSz3G`N|eb%`2wTJVV zmU2PAoad=P3cfQY?2F=jtQIe`&L+lcbou8p!u^jwF(>M>o;*O@s4|N4wf5>>8K;3Y z10E|Dnl>{IepkQnVoVRaJ29uUWP{(Ahnp2K$Z}QP<^Ei1I4Y#Z*z={{$6zqNH2ocy zZwJ$82AncX%&}V}*EO@|UEr{vrBFM^_u;DQm+NPATuvhrRBvnH9l-N6T?|>WcL5KE z|MVPPAb2OkOQz!Nn0R7UjbA~trLJrHyQPBuKD6oX>ZZ{`^B`>h2uzQHC60={ym;d4 zat1x(398f(Ns7i53hRw+!K-C6PJKgoYdNRj7LG_ZO5ITtN&a#YR2oEP3^h>9tn#S$ zg^&}T3g|P^b>Fdl_vV}8L}PC*H?5NOM{1|tgBU*%UPGfgW;VvpTECNrWrrdfF&-*! 
z8P@h3qek5hkZGR3@i5D<8;h_-Ex!kGTXRK)%j%a=<(BogLQ}zDNAyoPo0B8npF5) zo(m_s4_z_vvnhK}Cr{&bWItHOo;hRJUu3j{`|WkAEJErt>B7U!DY>{OOS4L;UES>D z?N9}ri+bzhm(RSfuW2q4C`N^LxbQMW*ljCyL%x{7N#f{zP&QnTJc7@?aWl}gamb(W zOd<%~Qf57R6KZ^pDrhr|)T*`h#_~J5&JN~_^>`7Ahut(jW@J*dk?Rh10p7JrLE?P0 z(3>KDwif>bC<0B)1TiKH$Dy{iD5}>t#i$*Rq{bTunG3FlO}2&Ij^b~HP-}Hl6Eap( z;*ns;2DwGzRR|6vU>nb`G9rZw!CpoD9q*Q8-ilDDSlkCLqPaHA(6S3beG7Gs@{}$O zAlZ$x8O<`4++BK5xDwG{Hds@Wo31I>dKDr{L@haZs8MYulWU4K*E$=VdGpEOm`r2E z649wE{TY{cTEGSgr!Bgo7*2$oJYX-J$U6zm86nhc8A?oTTZ%+_73P(8c3KmimW(^v zM^`G!rOpNs+T%NOi{l;-NxAPr0#{lT1R-*g!q7jYx9+ftqBTmf!!b(^89R#~Dc^bR z@{mq^ffD=zZvM#Y;v|_EMEB^N=YPU=MY)v*N8MYkol)DmFP-^^mX*}%4*81)>d8M4#EWdyF7TPs`U!{ zs)%Xj<1zAo)_j4vq;9T1CGVqwZI6aH#kivu_1jgcVlb;7PEBP_BGpl7oUe1|J!+au zm-y9gyQ=B{wL(~722mb66;*LN#qk&U=wxv^rZ*(4_(#^V2vHTm@Mh} zkO7BDTkKC^dq9>rw9+pc;RP)m#8*wfNl=CPvtT<6xi#fx`m~N{D3iy$7uWZSye7lj zpD~P8mJ_FMT*0Xd-Qd3N%$2Vzbd%T0KED5qTVMgWZ4?7mtAG_8jau|-!nBtL4L`@W z<^@#akXS4BzvPxzZoGRy9BI}Ej<;RIX^pcnW?#u-$II4$MnvVWFjRWU#Q2up7mFin zow14WYxmqN>wH04aiv3JLwww#Cbk>fZw^JyiN0fIY5l$*XGFT6qVNdkOikO1Xai(* zxK)0%l_zidO%UX#<@wavX9dk-w znww0nh9ucN&9SxY`g>Xlycjp$>BGn5n&65wjwd}_?du+hrBXIZ$2s&=%7Hw%(1r!C zKkrpd8m}GcGZI`Dgtw;J;>fXlBGZk@Vj>o)IMaXGm*sE^3h1wPCNe{^n0=cE8@s&B z9?rP8iVUsg$;iEnhAi>LXg`&zcYvVz*LB7>PAOkP298FOYMj}yBVwww`_Py89W+%- zlaUeucb2&`_3aFl+8((A*3w@0u_Pa{FP}UMUmbRpT0e;*zc}AToN6Z;%3+1>Wv?vb zU`w1!fHIgA9VMDIOQ3rT_QU7DS~n|h?%8;?Mx3ncqe(u+sIkKsPcjqB1dRxj4ztdB z$`$oql9|Xh)GbUpblE~?Bk?V*BbUZA(Wpq`PE_2Zq5N*%%&>(wLA#_)^Hl4b*%o0O zzi4>Z6zYD1c>LT^*2Aj6cgczEcIzMlolWt;hDcJhD6QZg|nD{&Jx|VTt zs@~Pa6Ou`?nTs#jMFjz~o(Hf`JVgNZiG8fBrt?bVWFo=#*ggs>nd`3ey;eIB%=w`H zb>|Z9t#^gGk2)6q@|ASlEfPGWXh}@7L7L*BwY}#~S;Q;v*Y3FBW;IiWjIQlrN^23k z0>bNF-z#4b@N*7I^NwvPAZlBXWADcu(BL~%F}{)2Fdo05zNA#9q44Tl*3WztC;WN0 z@KvI+^YHQ{b<1tUcmf6V1Ic!i)fU&K-?tq~-R64t+Fm6)66J&2)bl6@ya@M88@jQ~ z@MwL0Wf|}{j{ATtV@eaHlPkCxLdMH5H`os(!4XGtSR32Ic}>CeM7uxR_55$WFm5RA zS9~<>+Yr0N&Xc=~!&?!?CcDxHRKN9YSs^htKvU5gKhnKno3QR<(|$xpaz9R7%+N6x 
z^SnlI&FCGbL*#%GI_Aa)=2Ze<+6XiSOdI9xEZ}7A2w#%-Z3@;wk3|7GqCB?s3G#JWTPx5n#HG&dy!{r-P%H zjRk=J1u%e_97hEN1!%>;a{qe*ApAEX2qf}v-v1eg222NG9&V=g0Lj3^{!f`O|6==F z8c^jM%+c1#3W(r%IsOs)cYPqN zAR3FmXe1VHmJl8YKzCm98A$N}QpKSxK&a2_AOdRuS1gEvBElfPNLa`ZgMdn@4ipI9 z=5TYEs_W_Y79s)1%){-kK7}T~1wjG`MBwi6r@tX|NZ~}lHr|jp4$ut9I6wfB4+-<; z|C^WauRLH({?H)(4&nJH4M{@Me`Eoe;6Kv;H~#?ge;q01T)L%pFi5Fb3w%e-{j? z6FU%0JuN&;0j%q2`3FPDl>gRqU~agXy0}18@qYoc?O&6L=A(x*L?DTU>7R=sLO}|q PURJL-Sb14F*xCOJAC9)u literal 0 HcmV?d00001 diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py index 6572616769a91..6ed21de368ac3 100644 --- a/tests/multimodal/test_video.py +++ b/tests/multimodal/test_video.py @@ -18,6 +18,7 @@ from .utils import cosine_similarity, create_video_from_image, normalize_image pytestmark = pytest.mark.cpu_test +ASSETS_DIR = Path(__file__).parent / "assets" NUM_FRAMES = 10 FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3) FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3) @@ -140,3 +141,39 @@ def test_opencv_video_io_colorspace(is_color: bool, fourcc: str, ext: str): ) assert np.sum(np.isnan(sim)) / sim.size < 0.001 assert np.nanmean(sim) > 0.99 + + +def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch): + """ + Regression test for handling videos with broken frames. + This test uses a pre-corrupted video file (assets/corrupted.mp4) that + contains broken/unreadable frames to verify the video loader handles + them gracefully without crashing and returns accurate metadata. 
+ """ + with monkeypatch.context() as m: + m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv") + + # Load the pre-corrupted video file that contains broken frames + corrupted_video_path = ASSETS_DIR / "corrupted.mp4" + + with open(corrupted_video_path, "rb") as f: + video_data = f.read() + + loader = VIDEO_LOADER_REGISTRY.load("opencv") + frames, metadata = loader.load_bytes(video_data, num_frames=-1) + + # Verify metadata consistency: + # frames_indices must match actual loaded frames + assert frames.shape[0] == len(metadata["frames_indices"]), ( + f"Frames array size must equal frames_indices length. " + f"Got {frames.shape[0]} frames but " + f"{len(metadata['frames_indices'])} indices" + ) + + # Verify that broken frames were skipped: + # loaded frames should be less than total + assert frames.shape[0] < metadata["total_num_frames"], ( + f"Should load fewer frames than total due to broken frames. " + f"Expected fewer than {metadata['total_num_frames']} frames, " + f"but loaded {frames.shape[0]} frames" + ) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 369c5e6cb4d10..5c75bee54dd30 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -63,6 +63,63 @@ class VideoLoader: ) -> tuple[npt.NDArray, dict[str, Any]]: raise NotImplementedError + @staticmethod + def _read_frames( + cap, + frame_indices: set[int], + num_expected_frames: int, + max_frame_idx: int, + ) -> tuple[npt.NDArray, int, list[int]]: + import cv2 + + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + frames = np.empty((num_expected_frames, height, width, 3), dtype=np.uint8) + + i = 0 + valid_frame_indices = [] + for idx in range(max_frame_idx + 1): + ok = cap.grab() + if not ok: + # Frame is broken/unreadable, log warning + if idx in frame_indices: + logger.warning( + "Failed to grab frame %d during video loading. 
" + "This frame will be skipped.", + idx, + ) + continue + if idx in frame_indices: + ret, frame = cap.retrieve() + if ret: + frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + valid_frame_indices.append(idx) + i += 1 + else: + # retrieve() failed even though grab() succeeded + logger.warning( + "Failed to retrieve frame %d during video loading. " + "This frame will be skipped.", + idx, + ) + + valid_num_frames = len(valid_frame_indices) + if valid_num_frames < num_expected_frames: + logger.warning( + "Video loading completed with %d broken/unreadable frames. " + "Expected %d frames but only loaded %d frames.", + num_expected_frames - valid_num_frames, + num_expected_frames, + valid_num_frames, + ) + + assert i == valid_num_frames, ( + f"Expected reading {valid_num_frames} frames, " + f"but only loaded {i} frames from video." + ) + + return frames[:valid_num_frames], valid_num_frames, valid_frame_indices + VIDEO_LOADER_REGISTRY = ExtensionManager() @@ -120,24 +177,10 @@ class OpenCVVideoBackend(VideoLoader): ) frame_idx = uniform_sampled_frames.tolist() - width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) - height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) - frames = np.empty((len(frame_idx), height, width, 3), dtype=np.uint8) - - i = 0 - for idx in range(max(frame_idx) + 1): - ok = cap.grab() - if not ok: - break - if idx in frame_idx: - ret, frame = cap.retrieve() - if ret: - frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - i += 1 - - assert i == num_frames_to_sample, ( - f"Expected reading {num_frames_to_sample} frames, " - f"but only loaded {i} frames from video." 
+ # Convert to set for O(1) lookup performance + frame_idx_set = set(frame_idx) + frames, valid_num_frames, valid_frame_indices = cls._read_frames( + cap, frame_idx_set, num_frames_to_sample, max(frame_idx) ) # Use transformers transformers.video_utils.VideoMetadata format @@ -148,10 +191,10 @@ class OpenCVVideoBackend(VideoLoader): "fps": original_fps, "duration": duration, "video_backend": "opencv", - "frames_indices": list(frame_idx), + "frames_indices": valid_frame_indices, # extra field used to control hf processor's video # sampling behavior - "do_sample_frames": num_frames_to_sample == total_frames_num, + "do_sample_frames": valid_num_frames == total_frames_num, } return frames, metadata @@ -185,10 +228,10 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend): # Refer to: # https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140 - frame_indices: range | list[int] + frame_indices_list: list[int] if duration <= max_duration: n = int(math.floor(duration * fps)) - frame_indices = sorted( + frame_indices_list = sorted( { min(max_frame_idx, int(math.ceil(i * original_fps / fps))) for i in range(n) @@ -197,34 +240,23 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend): else: num_samples = int(max_duration * fps) if num_samples >= total_frames_num: - frame_indices = range(total_frames_num) + frame_indices_list = list(range(total_frames_num)) else: target_seconds = np.linspace(0, duration, num_samples, endpoint=True) - frame_indices = sorted( + frame_indices_list = sorted( { min(max_frame_idx, int(math.ceil(t * original_fps))) for t in target_seconds } ) - width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) - height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) - frames = np.empty((len(frame_indices), height, width, 3), dtype=np.uint8) - - i = 0 - for idx in range(total_frames_num): - ok = cap.grab() - if not ok: - break - if idx in frame_indices: - ret, frame = cap.retrieve() - if ret: - frames[i] = 
cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - i += 1 - - assert i == len(frame_indices), ( - f"Expected reading {len(frame_indices)} frames, " - f"but only loaded {i} frames from video." + # Convert to set for O(1) lookup performance + frame_indices_set = set(frame_indices_list) + frames, valid_num_frames, valid_frame_indices = cls._read_frames( + cap, + frame_indices_set, + len(frame_indices_list), + total_frames_num - 1, ) # Use transformers transformers.video_utils.VideoMetadata format @@ -233,7 +265,7 @@ class OpenCVDynamicVideoBackend(OpenCVVideoBackend): "fps": original_fps, "duration": duration, "video_backend": "opencv_dynamic", - "frames_indices": list(frame_indices), + "frames_indices": valid_frame_indices, "do_sample_frames": False, } From 64192d562402a56dc1e3a2141cfe896a7f0b52e9 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 20 Nov 2025 13:23:22 +0800 Subject: [PATCH 054/249] [Bugfix] Revert custom attention mask for gemma3-mm (#28995) Signed-off-by: Isotr0py Co-authored-by: Cyrus Leung --- vllm/config/model.py | 5 - vllm/model_executor/models/gemma3_mm.py | 138 +----------------------- vllm/transformers_utils/config.py | 11 -- vllm/v1/worker/gpu_model_runner.py | 19 ---- 4 files changed, 1 insertion(+), 172 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index d1e56a72a318b..97cba6ea7295e 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -32,7 +32,6 @@ from vllm.transformers_utils.config import ( try_get_generation_config, try_get_safetensors_metadata, try_get_tokenizer_config, - uses_custom_attention_masks, uses_mrope, ) from vllm.transformers_utils.gguf_utils import ( @@ -1625,10 +1624,6 @@ class ModelConfig: def uses_mrope(self) -> bool: return uses_mrope(self.hf_config) - @property - def uses_custom_attention_masks(self) -> bool: - return uses_custom_attention_masks(self.hf_config) - @property def is_multimodal_model(self) -> bool: return self.multimodal_config is not None diff --git 
a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index fe83c8b63b018..43c69e5e13992 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -596,7 +596,7 @@ class Gemma3ForConditionalGeneration( def get_language_model(self) -> torch.nn.Module: return self.language_model - def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings: + def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return [] @@ -644,142 +644,6 @@ class Gemma3ForConditionalGeneration( return hidden_states - def generate_attention_masks( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - mask_dtype: torch.dtype, - ) -> dict[str, Any]: - """Generate custom attention masks for Gemma3 multimodal inputs. - - This is called by V1 engine's gpu_model_runner during preprocessing - to generate attention masks that allow bidirectional attention between - image tokens while maintaining causal attention for text. - """ - # NOTE(woosuk): Here, we distinguish the sequences by the position id 0. - # This is a HACK. Fix this. 
- start_indices = (positions == 0).cpu().nonzero() - num_seqs = len(start_indices) - seq_lens = [] - for i in range(num_seqs): - start_idx = start_indices[i] - end_idx = start_indices[i + 1] if i < num_seqs - 1 else len(input_ids) - seq_lens.append(end_idx - start_idx) - - global_attn_masks = [] - local_attn_masks = [] - start_idx = 0 - for seq_idx, seq_len in enumerate(seq_lens): - end_idx = start_idx + seq_len - input_token_ids = input_ids[start_idx:end_idx] - - # Find image token positions - img_pos = input_token_ids == self.config.image_token_index - - start_idx = end_idx - - # Create a global causal mask - global_attn_mask = torch.empty( - 1, - 1, - seq_len, - seq_len, - dtype=mask_dtype, - device=input_ids.device, - ) - global_attn_mask.fill_(float("-inf")) - # Fill the lower triangle with 0 (causal attention) - global_attn_mask = global_attn_mask.triu(diagonal=1) - - # Enable bidirectional attention between image tokens - img_mask = torch.zeros_like(global_attn_mask) - img_mask[:, :, :, img_pos] += 1 - img_mask[:, :, img_pos, :] += 1 - global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask) - global_attn_masks.append(global_attn_mask) - - # GGUF compatibility: config might be Gemma3TextConfig directly - text_config = getattr(self.config, "text_config", self.config) - sliding_window = text_config.sliding_window - if sliding_window is not None: - # Create a local causal mask with sliding window (1024) - local_attn_mask = torch.ones_like(global_attn_mask) - local_attn_mask = torch.tril(local_attn_mask, diagonal=-sliding_window) - local_attn_mask = torch.where( - local_attn_mask == 0, global_attn_mask, float("-inf") - ) - local_attn_masks.append(local_attn_mask) - - return { - "has_images": True, - "seq_lens": seq_lens, - "global_attn_masks": global_attn_masks, - "local_attn_masks": local_attn_masks, - } - - def prepare_attn_masks( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - mask_dtype: torch.dtype, - **kwargs, - ): - 
kwargs["has_images"] = True - # NOTE(woosuk): Here, we distinguish the sequences by the position id 0. - # This is a HACK. Fix this. - start_indices = (positions == 0).cpu().nonzero() - num_seqs = len(start_indices) - seq_lens = [] - for i in range(num_seqs): - start_idx = start_indices[i].item() - if i < num_seqs - 1: - end_idx = start_indices[i + 1].item() - else: - end_idx = len(input_ids) - seq_lens.append(end_idx - start_idx) - kwargs["seq_lens"] = seq_lens - - global_attn_masks = [] - local_attn_masks = [] - start_idx = 0 - for seq_len in seq_lens: - end_idx = start_idx + seq_len - input_token_ids = input_ids[start_idx:end_idx] - start_idx = end_idx - # Create a global causal mask. - global_attn_mask = torch.empty( - 1, - 1, - seq_len, - seq_len, - dtype=mask_dtype, - device=input_ids.device, - ) - global_attn_mask.fill_(float("-inf")) - # Fill the lower triangle with 0. - global_attn_mask = global_attn_mask.triu(diagonal=1) - - # Consider the bidirectional attention between image tokens. - img_mask = torch.zeros_like(global_attn_mask) - img_pos = input_token_ids == self.config.image_token_index - img_mask[:, :, :, img_pos] += 1 - img_mask[:, :, img_pos, :] += 1 - global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask) - global_attn_masks.append(global_attn_mask) - - sliding_window = self.config.text_config.sliding_window - if sliding_window is not None: - # Create a local causal mask with sliding window (1024). 
- local_attn_mask = torch.ones_like(global_attn_mask) - local_attn_mask = torch.tril(local_attn_mask, diagonal=-sliding_window) - local_attn_mask = torch.where( - local_attn_mask == 0, global_attn_mask, float("-inf") - ) - local_attn_masks.append(local_attn_mask) - kwargs["global_attn_masks"] = global_attn_masks - kwargs["local_attn_masks"] = local_attn_masks - return kwargs - def compute_logits( self, hidden_states: torch.Tensor, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 4ca155af03dca..df24738477e76 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -520,17 +520,6 @@ def is_interleaved(config: PretrainedConfig) -> bool: return False -def uses_custom_attention_masks(config: PretrainedConfig) -> bool: - """Detect if model uses custom attention mask generation for multimodal. - - Some multimodal models require custom attention masks that enable - bidirectional attention between image tokens while maintaining causal - attention for text tokens. Currently applies to Gemma3 multimodal models. 
- """ - architectures = getattr(config, "architectures", []) - return "Gemma3ForConditionalGeneration" in architectures - - def _maybe_update_auto_config_kwargs(kwargs: dict[str, Any], model_type: str): """ Update kwargs for AutoConfig initialization based on model_type diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 80f8344d44100..0490ed39c8c78 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -324,7 +324,6 @@ class GPUModelRunner( # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope - self.uses_custom_attention_masks = model_config.uses_custom_attention_masks self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( model_config ) @@ -2352,24 +2351,6 @@ class GPUModelRunner( **self._init_model_kwargs(num_scheduled_tokens), **self._extract_mm_kwargs(scheduler_output), } - - # Generate custom attention masks for models that require them. - # V1 pre-generates embeddings, so forward() skips prepare_attn_masks(). - # Check mm_features (mm_embeds is empty during decode). - has_mm_features = any( - req_state.mm_features for req_state in self.requests.values() - ) - if ( - self.uses_custom_attention_masks - and has_mm_features - and hasattr(self.model, "generate_attention_masks") - ): - mask_kwargs = self.model.generate_attention_masks( - self.input_ids.gpu[:num_scheduled_tokens], - self.positions.gpu[:num_scheduled_tokens], - mask_dtype=self.model.dtype, - ) - model_kwargs.update(mask_kwargs) elif self.enable_prompt_embeds and is_first_rank: # Get the input embeddings for the tokens that are not input embeds, # then put them into the appropriate positions. 
From a9705a290af05ad71023714074ad8bf1a50c60a3 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Thu, 20 Nov 2025 06:04:23 +0000 Subject: [PATCH 055/249] [Model][QwenVL] Replace `torch.repeat_interleave` with faster `np.repeat` (#28964) Signed-off-by: Lukas Geiger --- .../models/multimodal/generation/test_qwen2_vl.py | 14 ++------------ vllm/model_executor/models/qwen2_vl.py | 15 +++++++++------ vllm/model_executor/models/qwen3_vl.py | 12 +++++++----- 3 files changed, 18 insertions(+), 23 deletions(-) diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index e10b8e1e77af1..e1b7dbf99f1fd 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -128,12 +128,7 @@ def batch_make_image_embeddings( visual = model.visual pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype) - image_grid_thw_on_device = image_grid_thw.to( - visual.device, dtype=torch.int64 - ) - return visual( - pixel_values_on_device, grid_thw=image_grid_thw_on_device - ).cpu() + return visual(pixel_values_on_device, grid_thw=image_grid_thw).cpu() image_embeds = torch.concat(llm.apply_model(get_image_embeds)) @@ -217,12 +212,7 @@ def batch_make_video_embeddings( visual = model.visual pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype) - video_grid_thw_on_device = video_grid_thw.to( - visual.device, dtype=torch.int64 - ) - return visual( - pixel_values_on_device, grid_thw=video_grid_thw_on_device - ).cpu() + return visual(pixel_values_on_device, grid_thw=video_grid_thw).cpu() video_embeds = torch.concat(llm.apply_model(get_image_embeds)) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index d25ff2785bfef..479a7871e364f 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -29,6 +29,7 @@ from collections.abc import Callable, Iterable, 
Mapping, Sequence from functools import partial from typing import Annotated, Any, Literal, TypeAlias +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F @@ -751,25 +752,27 @@ class Qwen2VisionTransformer(nn.Module): if isinstance(grid_thw, list): grid_thw_list = grid_thw - grid_thw = torch.tensor(grid_thw, dtype=torch.int32) + grid_thw = np.array(grid_thw, dtype=np.int32) else: grid_thw_list = grid_thw.tolist() + grid_thw = grid_thw.numpy() # compute position embedding rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list) # compute cu_seqlens - cu_seqlens = torch.repeat_interleave( - grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] - ).cumsum(dim=0, dtype=torch.int32) - cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens]) - cu_seqlens = cu_seqlens.to(self.device, non_blocking=True) + cu_seqlens = np.repeat(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + axis=0, dtype=np.int32 + ) + cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), cu_seqlens]) + cu_seqlens = torch.from_numpy(cu_seqlens) # transformers x = x.unsqueeze(1) # pre-compute seqlens for attn mask to reduce cuMemcpy operations max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) + cu_seqlens = cu_seqlens.to(self.device, non_blocking=True) for blk in self.blocks: x = blk( x, diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index c10aeaec5ab83..90c4894d33e88 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -553,18 +553,20 @@ class Qwen3_VisionTransformer(nn.Module): if isinstance(grid_thw, list): grid_thw_list = grid_thw - grid_thw = torch.tensor(grid_thw, dtype=torch.int32) + grid_thw = np.array(grid_thw, dtype=np.int32) else: grid_thw_list = grid_thw.tolist() + grid_thw = grid_thw.numpy() pos_embeds = self.fast_pos_embed_interpolate(grid_thw_list) hidden_states = hidden_states + pos_embeds rotary_pos_emb_cos, 
rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list) - cu_seqlens = torch.repeat_interleave( - grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] - ).cumsum(dim=0, dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) - cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens]) + cu_seqlens = np.repeat(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + axis=0, dtype=np.int32 + ) + cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), cu_seqlens]) + cu_seqlens = torch.from_numpy(cu_seqlens) hidden_states = hidden_states.unsqueeze(1) max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) From 1c7bcc55b86d6cb867072dfb890dec6c7e747a1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Wed, 19 Nov 2025 23:20:12 -0700 Subject: [PATCH 056/249] [Frontend] Allow parsed tool arguments (#28820) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/entrypoints/chat_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 3b722c2d92770..03214c4d131bc 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1437,7 +1437,8 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None: for item in message["tool_calls"]: # if arguments is None or empty string, set to {} if content := item["function"].get("arguments"): - item["function"]["arguments"] = json.loads(content) + if not isinstance(content, (dict, list)): + item["function"]["arguments"] = json.loads(content) else: item["function"]["arguments"] = {} From 20e4497be23f8e74882bfb0bd0db3d30dd821afc Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 20 Nov 2025 14:39:10 
+0800 Subject: [PATCH 057/249] [V0 Deprecation] Remove `num_lookahead_slots` (#29000) Signed-off-by: DarkLight1337 Co-authored-by: Michael Goin --- vllm/config/scheduler.py | 9 --------- vllm/config/speculative.py | 10 ---------- vllm/engine/arg_utils.py | 11 ----------- 3 files changed, 30 deletions(-) diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 8194295ffedb6..b6078706daacf 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -62,15 +62,6 @@ class SchedulerConfig: """For chunked prefill, a request is considered long if the prompt is longer than this number of tokens.""" - num_lookahead_slots: int = Field(default=0, ge=0) - """The number of slots to allocate per sequence per - step, beyond the known token ids. This is used in speculative - decoding to store KV activations of tokens which may or may not be - accepted. - - NOTE: This will be replaced by speculative config in the future; it is - present to enable correctness tests until then.""" - enable_chunked_prefill: bool = True """If True, prefill requests can be chunked based on the remaining `max_num_batched_tokens`. diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 13a8632413d91..a0c65b6049e1e 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -634,16 +634,6 @@ class SpeculativeConfig: return self - @property - def num_lookahead_slots(self) -> int: - """The number of additional slots the scheduler should allocate per - step, in addition to the slots allocated for each known token. - - This is equal to the number of speculative tokens, as each speculative - token must be scored. 
- """ - return self.num_speculative_tokens - def use_eagle(self) -> bool: return self.method in ("eagle", "eagle3", "mtp") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 68205b6079d78..74828bc109cbe 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -488,7 +488,6 @@ class EngineArgs: ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override - num_lookahead_slots: int = SchedulerConfig.num_lookahead_slots model_loader_extra_config: dict = get_field(LoadConfig, "model_loader_extra_config") ignore_patterns: str | list[str] = get_field(LoadConfig, "ignore_patterns") @@ -1081,9 +1080,6 @@ class EngineArgs: "--long-prefill-token-threshold", **scheduler_kwargs["long_prefill_token_threshold"], ) - scheduler_group.add_argument( - "--num-lookahead-slots", **scheduler_kwargs["num_lookahead_slots"] - ) # multi-step scheduling has been removed; corresponding arguments # are no longer supported. 
scheduler_group.add_argument( @@ -1653,18 +1649,11 @@ class EngineArgs: target_parallel_config=parallel_config, ) - # make sure num_lookahead_slots is set appropriately depending on - # whether speculative decoding is enabled - num_lookahead_slots = self.num_lookahead_slots - if speculative_config is not None: - num_lookahead_slots = speculative_config.num_lookahead_slots - scheduler_config = SchedulerConfig( runner_type=model_config.runner_type, max_num_batched_tokens=self.max_num_batched_tokens, max_num_seqs=self.max_num_seqs, max_model_len=model_config.max_model_len, - num_lookahead_slots=num_lookahead_slots, enable_chunked_prefill=self.enable_chunked_prefill, disable_chunked_mm_input=self.disable_chunked_mm_input, is_multimodal_model=model_config.is_multimodal_model, From 7218f83992c7d61fc3845ea24407a1f3b909713e Mon Sep 17 00:00:00 2001 From: Pleaplusone Date: Thu, 20 Nov 2025 15:50:23 +0800 Subject: [PATCH 058/249] [ROCm][BugFix] Fix shared expert loading error when disable `VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS` (#28633) Signed-off-by: ganyi --- vllm/model_executor/models/deepseek_v2.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index c50fc327e7608..d0a116b97997a 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -287,7 +287,10 @@ class DeepseekV2MoE(nn.Module): ) self.is_rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() - if config.n_shared_experts is None or self.is_rocm_aiter_moe_enabled: + self.is_fusion_moe_shared_experts_enabled = ( + rocm_aiter_ops.is_fusion_moe_shared_experts_enabled() + ) + if config.n_shared_experts is None or self.is_fusion_moe_shared_experts_enabled: self.shared_experts = None else: intermediate_size = config.moe_intermediate_size * config.n_shared_experts @@ -327,7 +330,7 @@ class DeepseekV2MoE(nn.Module): 
num_redundant_experts=self.n_redundant_experts, is_sequence_parallel=self.is_sequence_parallel, n_shared_experts=config.n_shared_experts - if rocm_aiter_ops.is_fusion_moe_shared_experts_enabled() + if self.is_fusion_moe_shared_experts_enabled else None, ) From 1e1c06789e63a760d91aaf6e4ddfeabfe382c301 Mon Sep 17 00:00:00 2001 From: Bradley D Date: Wed, 19 Nov 2025 23:53:38 -0800 Subject: [PATCH 059/249] [ci][amd] fix EPLB execution test (#28742) Signed-off-by: Bradley Davis --- tests/distributed/test_eplb_execute.py | 387 +++++++++++++------------ 1 file changed, 195 insertions(+), 192 deletions(-) diff --git a/tests/distributed/test_eplb_execute.py b/tests/distributed/test_eplb_execute.py index 7b45ae82c72d4..0a97749ac318c 100644 --- a/tests/distributed/test_eplb_execute.py +++ b/tests/distributed/test_eplb_execute.py @@ -1,13 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import multiprocessing import os import random import pytest import torch import torch.distributed +import torch.multiprocessing as mp from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace from vllm.distributed.parallel_state import ( @@ -17,10 +17,12 @@ from vllm.distributed.parallel_state import ( ) from vllm.utils.system_utils import update_environment_variables +mp.set_start_method("spawn", force=True) -def distributed_run(fn, world_size): + +def distributed_run(fn, world_size, *args): number_of_processes = world_size - processes: list[multiprocessing.Process] = [] + processes: list[mp.Process] = [] for i in range(number_of_processes): env: dict[str, str] = {} env["RANK"] = str(i) @@ -29,7 +31,7 @@ def distributed_run(fn, world_size): env["LOCAL_WORLD_SIZE"] = str(number_of_processes) env["MASTER_ADDR"] = "localhost" env["MASTER_PORT"] = "12345" - p = multiprocessing.Process(target=fn, args=(env,)) + p = mp.Process(target=fn, args=(env, world_size, *args)) processes.append(p) p.start() @@ 
-40,24 +42,16 @@ def distributed_run(fn, world_size): assert p.exitcode == 0 -def worker_fn_wrapper(fn): - # `multiprocessing.Process` cannot accept environment variables directly - # so we need to pass the environment variables as arguments - # and update the environment variables in the function - def wrapped_fn(env): - update_environment_variables(env) - local_rank = os.environ["LOCAL_RANK"] - device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) - init_distributed_environment() +def set_env_vars_and_device(env: dict[str, str]) -> None: + update_environment_variables(env) + local_rank = os.environ["LOCAL_RANK"] + device = torch.device(f"cuda:{local_rank}") + torch.cuda.set_device(device) + init_distributed_environment() - # Ensure each worker process has the same random seed - random.seed(42) - torch.manual_seed(42) - - fn() - - return wrapped_fn + # Ensure each worker process has the same random seed + random.seed(42) + torch.manual_seed(42) def create_expert_indices_with_redundancy( @@ -275,6 +269,79 @@ def verify_redundant_experts_have_same_weights( ) +def _test_rearrange_expert_weights_with_redundancy( + env, world_size, num_layers, num_local_experts, num_logical_experts +) -> None: + # Initialize model parallel (using tensor parallel as an entrypoint + # to expert parallel) + set_env_vars_and_device(env) + ensure_model_parallel_initialized( + tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 + ) + + ep_group = get_tp_group().cpu_group + ep_rank = torch.distributed.get_rank() + device = torch.device(f"cuda:{ep_rank}") + + # Test parameters + total_physical_experts = world_size * num_local_experts + hidden_sizes = [32, 64] # Two different weight matrices + + # Create old expert indices (with redundancy) + redundancy_config = create_redundancy_config( + num_logical_experts, total_physical_experts + ) + + old_indices = create_expert_indices_with_redundancy( + num_layers, + num_logical_experts, + total_physical_experts, 
+ redundancy_config, + ) + + # Create new expert indices (with redundancy) + new_redundancy_config = create_redundancy_config( + num_logical_experts, total_physical_experts + ) + new_indices = create_expert_indices_with_redundancy( + num_layers, + num_logical_experts, + total_physical_experts, + new_redundancy_config, + ) + + # Create expert weights + expert_weights = create_expert_weights( + num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices + ) + + # Execute weight rearrangement + rearrange_expert_weights_inplace( + old_indices, + new_indices, + expert_weights, + ep_group, + is_profile=False, + ) + + # Verify the rearrangement result + verify_expert_weights_after_shuffle( + expert_weights, + new_indices, + hidden_sizes, + ep_rank, + num_local_experts, + ) + + verify_redundant_experts_have_same_weights( + expert_weights, + new_indices, + hidden_sizes, + world_size, + num_local_experts, + ) + + @pytest.mark.parametrize( "world_size,num_layers,num_local_experts,num_logical_experts", [ @@ -305,78 +372,69 @@ def test_rearrange_expert_weights_with_redundancy( if torch.cuda.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") + distributed_run( + _test_rearrange_expert_weights_with_redundancy, + world_size, + num_layers, + num_local_experts, + num_logical_experts, + ) - @worker_fn_wrapper - def worker_fn(): - # Initialize model parallel (using tensor parallel as an entrypoint - # to expert parallel) - ensure_model_parallel_initialized( - tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 - ) - ep_group = get_tp_group().cpu_group - ep_rank = torch.distributed.get_rank() - device = torch.device(f"cuda:{ep_rank}") +def _test_rearrange_expert_weights_no_change(env, world_size) -> None: + set_env_vars_and_device(env) + ensure_model_parallel_initialized( + tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 + ) - # Test parameters - total_physical_experts = world_size * 
num_local_experts - hidden_sizes = [32, 64] # Two different weight matrices + ep_group = get_tp_group().cpu_group + ep_rank = torch.distributed.get_rank() + device = torch.device(f"cuda:{ep_rank}") - # Create old expert indices (with redundancy) - redundancy_config = create_redundancy_config( - num_logical_experts, total_physical_experts - ) + num_layers = 2 + num_local_experts = 2 + total_physical_experts = world_size * num_local_experts + num_logical_experts = total_physical_experts // 2 # Some redundancy + hidden_sizes = [32, 64] - old_indices = create_expert_indices_with_redundancy( - num_layers, - num_logical_experts, - total_physical_experts, - redundancy_config, - ) + # Create redundancy configuration + redundancy_config = [2] * num_logical_experts - # Create new expert indices (with redundancy) - new_redundancy_config = create_redundancy_config( - num_logical_experts, total_physical_experts - ) - new_indices = create_expert_indices_with_redundancy( - num_layers, - num_logical_experts, - total_physical_experts, - new_redundancy_config, - ) + # Same indices - no change + indices = create_expert_indices_with_redundancy( + num_layers, num_logical_experts, total_physical_experts, redundancy_config + ) - # Create expert weights - expert_weights = create_expert_weights( - num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices - ) + expert_weights = create_expert_weights( + num_layers, num_local_experts, hidden_sizes, ep_rank, device, indices + ) - # Execute weight rearrangement - rearrange_expert_weights_inplace( - old_indices, - new_indices, - expert_weights, - ep_group, - is_profile=False, - ) + # Save original weights + original_weights = [] + for layer_weights in expert_weights: + layer_copy = [] + for weight in layer_weights: + layer_copy.append(weight.clone()) + original_weights.append(layer_copy) - # Verify the rearrangement result - verify_expert_weights_after_shuffle( - expert_weights, - new_indices, - hidden_sizes, - ep_rank, - 
num_local_experts, - ) + # Execute rearrangement (should be no change) + rearrange_expert_weights_inplace( + indices, + indices, # Same indices + expert_weights, + ep_group, + is_profile=False, + ) - verify_redundant_experts_have_same_weights( - expert_weights, - new_indices, - hidden_sizes, - world_size, - num_local_experts, - ) - - distributed_run(worker_fn, world_size) + # Verify that the weights have not changed + for layer in range(num_layers): + for weight_idx in range(len(hidden_sizes)): + torch.testing.assert_close( + expert_weights[layer][weight_idx], + original_weights[layer][weight_idx], + msg=f"""Layer {layer}, weight {weight_idx} + should remain unchanged""", + ) @pytest.mark.parametrize("world_size", [2, 4]) @@ -388,62 +446,69 @@ def test_rearrange_expert_weights_no_change(world_size): if torch.cuda.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") + distributed_run(_test_rearrange_expert_weights_no_change, world_size) - @worker_fn_wrapper - def worker_fn(): - ensure_model_parallel_initialized( - tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 - ) - ep_group = get_tp_group().cpu_group - ep_rank = torch.distributed.get_rank() - device = torch.device(f"cuda:{ep_rank}") +def _test_rearrange_expert_weights_profile_mode(env, world_size) -> None: + set_env_vars_and_device(env) + ensure_model_parallel_initialized( + tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 + ) - num_layers = 2 - num_local_experts = 2 - total_physical_experts = world_size * num_local_experts - num_logical_experts = total_physical_experts // 2 # Some redundancy - hidden_sizes = [32, 64] + ep_group = get_tp_group().cpu_group + ep_rank = torch.distributed.get_rank() + device = torch.device(f"cuda:{ep_rank}") - # Create redundancy configuration - redundancy_config = [2] * num_logical_experts + num_layers = 1 + num_local_experts = 2 + total_physical_experts = world_size * num_local_experts + 
num_logical_experts = total_physical_experts // 2 + hidden_sizes = [32] - # Same indices - no change - indices = create_expert_indices_with_redundancy( - num_layers, num_logical_experts, total_physical_experts, redundancy_config - ) + # Create different index distributions + old_redundancy = create_redundancy_config( + num_logical_experts, total_physical_experts + ) + new_redundancy = create_redundancy_config( + num_logical_experts, total_physical_experts + ) - expert_weights = create_expert_weights( - num_layers, num_local_experts, hidden_sizes, ep_rank, device, indices - ) + old_indices = create_expert_indices_with_redundancy( + num_layers, num_logical_experts, total_physical_experts, old_redundancy + ) + new_indices = create_expert_indices_with_redundancy( + num_layers, num_logical_experts, total_physical_experts, new_redundancy + ) - # Save original weights - original_weights = [] - for layer_weights in expert_weights: - layer_copy = [] - for weight in layer_weights: - layer_copy.append(weight.clone()) - original_weights.append(layer_copy) + expert_weights = create_expert_weights( + num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices + ) - # Execute rearrangement (should be no change) - rearrange_expert_weights_inplace( - indices, - indices, # Same indices - expert_weights, - ep_group, - is_profile=False, - ) + # Save original weights + original_weights = [] + for layer_weights in expert_weights: + layer_copy = [] + for weight in layer_weights: + layer_copy.append(weight.clone()) + original_weights.append(layer_copy) - # Verify that the weights have not changed - for layer in range(num_layers): - for weight_idx in range(len(hidden_sizes)): - torch.testing.assert_close( - expert_weights[layer][weight_idx], - original_weights[layer][weight_idx], - msg=f"Layer {layer}, weight {weight_idx} should remain unchanged", - ) + # Execute profile mode rearrangement + rearrange_expert_weights_inplace( + old_indices, + new_indices, + expert_weights, + 
ep_group, + is_profile=True, # Profile mode + ) - distributed_run(worker_fn, world_size) + # In profile mode, the weights should remain unchanged + for layer in range(num_layers): + for weight_idx in range(len(hidden_sizes)): + torch.testing.assert_close( + expert_weights[layer][weight_idx], + original_weights[layer][weight_idx], + msg="In profile mode, the weights should remain unchanged", + ) @pytest.mark.parametrize("world_size", [2, 4]) @@ -452,66 +517,4 @@ def test_rearrange_expert_weights_profile_mode(world_size): if torch.cuda.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") - - @worker_fn_wrapper - def worker_fn(): - ensure_model_parallel_initialized( - tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 - ) - - ep_group = get_tp_group().cpu_group - ep_rank = torch.distributed.get_rank() - device = torch.device(f"cuda:{ep_rank}") - - num_layers = 1 - num_local_experts = 2 - total_physical_experts = world_size * num_local_experts - num_logical_experts = total_physical_experts // 2 - hidden_sizes = [32] - - # Create different index distributions - old_redundancy = create_redundancy_config( - num_logical_experts, total_physical_experts - ) - new_redundancy = create_redundancy_config( - num_logical_experts, total_physical_experts - ) - - old_indices = create_expert_indices_with_redundancy( - num_layers, num_logical_experts, total_physical_experts, old_redundancy - ) - new_indices = create_expert_indices_with_redundancy( - num_layers, num_logical_experts, total_physical_experts, new_redundancy - ) - - expert_weights = create_expert_weights( - num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices - ) - - # Save original weights - original_weights = [] - for layer_weights in expert_weights: - layer_copy = [] - for weight in layer_weights: - layer_copy.append(weight.clone()) - original_weights.append(layer_copy) - - # Execute profile mode rearrangement - 
rearrange_expert_weights_inplace( - old_indices, - new_indices, - expert_weights, - ep_group, - is_profile=True, # Profile mode - ) - - # In profile mode, the weights should remain unchanged - for layer in range(num_layers): - for weight_idx in range(len(hidden_sizes)): - torch.testing.assert_close( - expert_weights[layer][weight_idx], - original_weights[layer][weight_idx], - msg="In profile mode, the weights should remain unchanged", - ) - - distributed_run(worker_fn, world_size) + distributed_run(_test_rearrange_expert_weights_profile_mode, world_size) From 2c52c7fd9a480f96ac93e63eccf9a3ee01686ad4 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 20 Nov 2025 03:52:23 -0500 Subject: [PATCH 060/249] [Bug] Fix torch dynamo warning Dynamo detected a call to a `functools.lru_cache` (#29038) Signed-off-by: yewentao256 --- tests/v1/determinism/conftest.py | 5 +-- tests/v1/determinism/test_batch_invariance.py | 35 +++++-------------- .../test_online_batch_invariance.py | 12 +++++-- tests/v1/determinism/utils.py | 20 +++++++++++ vllm/model_executor/layers/batch_invariant.py | 20 ++++++----- 5 files changed, 52 insertions(+), 40 deletions(-) diff --git a/tests/v1/determinism/conftest.py b/tests/v1/determinism/conftest.py index 3c2136e005849..bde02bbd0d5c6 100644 --- a/tests/v1/determinism/conftest.py +++ b/tests/v1/determinism/conftest.py @@ -1,11 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - import pytest +import vllm.model_executor.layers.batch_invariant as batch_invariant + @pytest.fixture(autouse=True) def enable_batch_invariant_mode(monkeypatch: pytest.MonkeyPatch): """Automatically enable batch invariant kernel overrides for all tests.""" + monkeypatch.setattr(batch_invariant, "VLLM_BATCH_INVARIANT", True) monkeypatch.setenv("VLLM_BATCH_INVARIANT", "1") - yield diff --git a/tests/v1/determinism/test_batch_invariance.py 
b/tests/v1/determinism/test_batch_invariance.py index d4e88891512c4..74ae5e182da78 100644 --- a/tests/v1/determinism/test_batch_invariance.py +++ b/tests/v1/determinism/test_batch_invariance.py @@ -6,29 +6,16 @@ import random import pytest import torch -from utils import _extract_step_logprobs, _random_prompt, skip_unsupported +from utils import ( + BACKENDS, + _extract_step_logprobs, + _random_prompt, + resolve_model_name, + skip_unsupported, +) +import vllm.model_executor.layers.batch_invariant as batch_invariant from vllm import LLM, SamplingParams -from vllm.platforms import current_platform - -BACKENDS: list[str] = [ - "FLASH_ATTN", - "FLASHINFER", -] - -if current_platform.is_cuda() and current_platform.is_device_capability(90): - BACKENDS.append("FLASH_ATTN_MLA") - -DEFAULT_MODEL = "Qwen/Qwen3-1.7B" -MLA_MODEL = "deepseek-ai/DeepSeek-V2-Lite-Chat" - - -def resolve_model_name(backend: str) -> str: - """Resolve the model name for the given backend, respecting env overrides.""" - model = os.getenv("VLLM_TEST_MODEL", DEFAULT_MODEL) - if backend.endswith("MLA") and model == DEFAULT_MODEL: - return MLA_MODEL - return model @skip_unsupported @@ -454,14 +441,10 @@ def test_logprobs_without_batch_invariance_should_fail( The test will PASS if we detect differences (proving batch invariance matters). The test will FAIL if everything matches (suggesting batch invariance isn't needed). 
""" - from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant - - vllm_is_batch_invariant.cache_clear() monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) # CRITICAL: Disable batch invariance for this test - monkeypatch.setenv("VLLM_BATCH_INVARIANT", "0") - + monkeypatch.setattr(batch_invariant, "VLLM_BATCH_INVARIANT", False) seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) model_name = resolve_model_name(backend) diff --git a/tests/v1/determinism/test_online_batch_invariance.py b/tests/v1/determinism/test_online_batch_invariance.py index 23f47863dd23f..d74b435797f8f 100644 --- a/tests/v1/determinism/test_online_batch_invariance.py +++ b/tests/v1/determinism/test_online_batch_invariance.py @@ -16,7 +16,8 @@ import sys from typing import Any import openai -from utils import _random_prompt, skip_unsupported +import pytest +from utils import BACKENDS, _random_prompt, resolve_model_name, skip_unsupported from tests.utils import RemoteOpenAIServer @@ -133,9 +134,14 @@ def _compare_bs1_vs_bsn_single_process( @skip_unsupported -def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN(): +@pytest.mark.parametrize("backend", BACKENDS) +def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( + backend: str, monkeypatch: pytest.MonkeyPatch +) -> None: random.seed(int(os.getenv("VLLM_TEST_SEED", "12345"))) - model_name = os.getenv("VLLM_TEST_MODEL", "Qwen/Qwen3-1.7B") + # Override backend for this test (and the RemoteOpenAIServer child process). 
+ monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) + model_name = resolve_model_name(backend) prompts_all = [_random_prompt(10, 50) for _ in range(32)] sp_kwargs: dict[str, Any] = { diff --git a/tests/v1/determinism/utils.py b/tests/v1/determinism/utils.py index 5141837faea04..7ee442551e2c3 100644 --- a/tests/v1/determinism/utils.py +++ b/tests/v1/determinism/utils.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os import random import pytest @@ -12,6 +13,25 @@ skip_unsupported = pytest.mark.skipif( reason="Requires CUDA and >= Hopper (SM90)", ) +BACKENDS: list[str] = [ + "FLASH_ATTN", + "FLASHINFER", +] + +if current_platform.is_cuda() and current_platform.is_device_capability(90): + BACKENDS.append("FLASH_ATTN_MLA") + +DEFAULT_MODEL = "Qwen/Qwen3-1.7B" +MLA_MODEL = "deepseek-ai/DeepSeek-V2-Lite-Chat" + + +def resolve_model_name(backend: str) -> str: + """Resolve the model name for the given backend.""" + model = os.getenv("VLLM_TEST_MODEL", DEFAULT_MODEL) + if backend.endswith("MLA") and model == DEFAULT_MODEL: + return MLA_MODEL + return model + def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str: # Generate more realistic prompts that will actually produce varied tokens diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 5dbeb29174349..69fa6bdffd43f 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from collections.abc import Callable -from functools import cache from typing import Any import torch @@ -785,16 +784,19 @@ def enable_batch_invariant_mode(): torch.backends.cuda.preferred_blas_library(backend="cublaslt") -@cache -def vllm_is_batch_invariant(): - env_key = "VLLM_BATCH_INVARIANT" - is_overridden = False - val = 
os.getenv(env_key, "0") +def _read_vllm_batch_invariant() -> bool: + val = os.getenv("VLLM_BATCH_INVARIANT", "0") try: - is_overridden = int(val) != 0 + return int(val) != 0 except ValueError: - is_overridden = False - return is_overridden + return False + + +VLLM_BATCH_INVARIANT: bool = _read_vllm_batch_invariant() + + +def vllm_is_batch_invariant() -> bool: + return VLLM_BATCH_INVARIANT def override_envs_for_invariance(): From 322cb02872d806afcaaa7d0aac3fad7f304b7888 Mon Sep 17 00:00:00 2001 From: rasmith Date: Thu, 20 Nov 2025 03:48:09 -0600 Subject: [PATCH 061/249] [CI/Build][AMD] Fix import errors in tests/kernels/attention (#29032) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- .../attention/test_cascade_flash_attn.py | 18 +++++++++++++----- tests/kernels/attention/test_flash_attn.py | 19 ++++++++++++++----- tests/kernels/attention/test_flashinfer.py | 12 ++++++++++-- .../attention/test_flashinfer_mla_decode.py | 3 ++- .../test_flashinfer_trtllm_attention.py | 3 ++- tests/kernels/moe/test_flashinfer.py | 9 ++++++++- 6 files changed, 49 insertions(+), 15 deletions(-) diff --git a/tests/kernels/attention/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py index 20f573821b25f..d86041d71febd 100755 --- a/tests/kernels/attention/test_cascade_flash_attn.py +++ b/tests/kernels/attention/test_cascade_flash_attn.py @@ -7,11 +7,19 @@ import torch from vllm.platforms import current_platform from vllm.v1.attention.backends.flash_attn import cascade_attention, merge_attn_states -from vllm.vllm_flash_attn import ( - fa_version_unsupported_reason, - flash_attn_varlen_func, - is_fa_version_supported, -) + +try: + from vllm.vllm_flash_attn import ( + fa_version_unsupported_reason, + flash_attn_varlen_func, + is_fa_version_supported, + ) +except ImportError: + if current_platform.is_rocm(): + pytest.skip( + "vllm_flash_attn is not supported for vLLM on ROCm.", + allow_module_level=True, + ) NUM_HEADS = [(4, 4), (8, 2), (16, 2)] 
HEAD_SIZES = [128, 192, 256] diff --git a/tests/kernels/attention/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py index 26b8c77ab482f..bbd5df5419f80 100644 --- a/tests/kernels/attention/test_flash_attn.py +++ b/tests/kernels/attention/test_flash_attn.py @@ -6,11 +6,20 @@ import pytest import torch from vllm.platforms import current_platform -from vllm.vllm_flash_attn import ( - fa_version_unsupported_reason, - flash_attn_varlen_func, - is_fa_version_supported, -) + +try: + from vllm.vllm_flash_attn import ( + fa_version_unsupported_reason, + flash_attn_varlen_func, + is_fa_version_supported, + ) +except ImportError: + if current_platform.is_rocm(): + pytest.skip( + "vllm_flash_attn is not supported for vLLM on ROCm.", + allow_module_level=True, + ) + NUM_HEADS = [(4, 4), (8, 2)] HEAD_SIZES = [40, 72, 80, 128, 256] diff --git a/tests/kernels/attention/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py index 82ec2ef14e56c..eedeec33e0d45 100644 --- a/tests/kernels/attention/test_flashinfer.py +++ b/tests/kernels/attention/test_flashinfer.py @@ -2,12 +2,20 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import flashinfer import pytest -import torch from vllm.platforms import current_platform +try: + import flashinfer +except ImportError: + if current_platform.is_rocm(): + pytest.skip( + "flashinfer is not supported for vLLM on ROCm.", allow_module_level=True + ) + +import torch + NUM_HEADS = [(32, 8), (6, 1)] HEAD_SIZES = [128, 256] BLOCK_SIZES = [16, 32] diff --git a/tests/kernels/attention/test_flashinfer_mla_decode.py b/tests/kernels/attention/test_flashinfer_mla_decode.py index 0350136677c6b..d183f67d3919e 100644 --- a/tests/kernels/attention/test_flashinfer_mla_decode.py +++ b/tests/kernels/attention/test_flashinfer_mla_decode.py @@ -3,7 +3,6 @@ import pytest import torch import torch.nn.functional as F -from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla from torch import Tensor from 
vllm.platforms import current_platform @@ -15,6 +14,8 @@ if not current_platform.has_device_capability(100): reason="FlashInfer MLA Requires compute capability of 10 or above.", allow_module_level=True, ) +else: + from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla def ref_mla( diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py index 693b849ebc5d7..98ea40608b468 100644 --- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py +++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import flashinfer import pytest import torch @@ -16,6 +15,8 @@ if not current_platform.is_device_capability(100): pytest.skip( "This TRTLLM kernel requires NVIDIA Blackwell.", allow_module_level=True ) +else: + import flashinfer FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 FP8_DTYPE = current_platform.fp8_dtype() diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py index 218df4a2632c3..638741e91619b 100644 --- a/tests/kernels/moe/test_flashinfer.py +++ b/tests/kernels/moe/test_flashinfer.py @@ -22,7 +22,14 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( from vllm.model_executor.layers.quantization.utils.fp8_utils import input_to_float8 from vllm.model_executor.models.llama4 import Llama4MoE from vllm.platforms import current_platform -from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe + +try: + from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe +except ImportError: + if current_platform.is_rocm(): + pytest.skip( + "flashinfer not supported for vLLM on ROCm", allow_module_level=True + ) if not has_flashinfer_cutlass_fused_moe() or not current_platform.has_device_capability( 90 From a903d59ffaffd9160c517fa337b3ab0265a898c3 Mon Sep 17 00:00:00 2001 
From: Boyuan Feng Date: Thu, 20 Nov 2025 02:51:36 -0800 Subject: [PATCH 062/249] cleanup at::Tag::needs_fixed_stride_order (#28974) Signed-off-by: Boyuan Feng Co-authored-by: Cyrus Leung --- csrc/cpu/torch_bindings.cpp | 7 ++-- csrc/torch_bindings.cpp | 64 +++++++++++-------------------------- 2 files changed, 20 insertions(+), 51 deletions(-) diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index b07d20bab7dd9..e0e3ef71b485f 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -172,7 +172,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Quantization #if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) || \ defined(__powerpc64__) - at::Tag stride_tag = at::Tag::needs_fixed_stride_order; // Helper function to release oneDNN handlers ops.def("release_dnnl_matmul_handler(int handler) -> ()", &release_dnnl_matmul_handler); @@ -208,15 +207,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Compute int8 quantized tensor for given scaling factor. ops.def( "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale," - "Tensor? azp) -> ()", - {stride_tag}); + "Tensor? azp) -> ()"); ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant); // Compute int8 quantized tensor and scaling factor ops.def( "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, " - "Tensor!? azp) -> ()", - {stride_tag}); + "Tensor!? azp) -> ()"); ops.impl("dynamic_scaled_int8_quant", torch::kCPU, &dynamic_scaled_int8_quant); #endif diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index c3ae06a30e3e8..5af74c2c2a6b0 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -20,18 +20,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops // - // The default behavior in PyTorch 2.6 was changed to "requires_contiguous", - // so we need - // to override this for many GEMMs with the following tag. 
Otherwise, - // torch.compile will force all input tensors to be contiguous(), which - // will break many custom ops that require column-major weight matrices. - // This was a bug and PyTorch 2.7 has since fixed this. -#if TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 6 - #define stride_tag at::Tag::needs_fixed_stride_order -#else - #define stride_tag -#endif - ops.def( "persistent_masked_m_silu_mul_quant(Tensor input, Tensor counts, Tensor! " "y_q, Tensor! y_s," @@ -241,15 +229,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Quantized GEMM for AWQ. ops.def( "awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, " - "Tensor _zeros, SymInt split_k_iters) -> Tensor", - {stride_tag}); + "Tensor _zeros, SymInt split_k_iters) -> Tensor"); ops.impl("awq_gemm", torch::kCUDA, &awq_gemm); // Dequantization for AWQ. ops.def( "awq_dequantize(Tensor _kernel, Tensor _scaling_factors, " - "Tensor _zeros, SymInt split_k_iters, int thx, int thy) -> Tensor", - {stride_tag}); + "Tensor _zeros, SymInt split_k_iters, int thx, int thy) -> Tensor"); ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize); // Note about marlin kernel 'workspace' arguments: @@ -271,8 +257,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, " "Tensor b_scales, Tensor workspace, " "int b_q_type, " - "SymInt size_m, SymInt size_n, SymInt size_k) -> Tensor", - {stride_tag}); + "SymInt size_m, SymInt size_n, SymInt size_k) -> Tensor"); // conditionally compiled so impl in source file // Machete (Dense) Optimized Mixed Precision GEMM for Hopper. @@ -298,8 +283,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor? channel_scales," " Tensor? token_scales," " str? schedule" - ") -> Tensor", - {stride_tag}); + ") -> Tensor"); ops.def( "machete_prepack_B(" " Tensor B," @@ -319,8 +303,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "Tensor b_scales, Tensor? global_scale, Tensor? 
b_zeros_or_none, Tensor? " "g_idx_or_none, Tensor? perm_or_none, Tensor workspace, int b_q_type, " "SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, " - "bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) -> Tensor", - {stride_tag}); + "bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) -> Tensor"); // conditionally compiled so impl registration is in source file // gptq_marlin repack from GPTQ. @@ -346,8 +329,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor token_scales," " ScalarType? out_type," " str? maybe_schedule" - ") -> Tensor", - {stride_tag}); + ") -> Tensor"); // pack scales ops.def("cutlass_pack_scale_fp8(Tensor scales) -> Tensor"); // encode and reorder weight matrix @@ -394,24 +376,21 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def( "cutlass_scaled_fp4_mm(Tensor! out, Tensor a, Tensor b," " Tensor block_scale_a, Tensor block_scale_b," - " Tensor alpha) -> ()", - {stride_tag}); + " Tensor alpha) -> ()"); ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm); // cutlass blockwise scaledgroup GEMM ops.def( "cutlass_blockwise_scaled_grouped_mm(Tensor! output, Tensor a, Tensor b, " "Tensor scales_a, Tensor scales_b, " - "Tensor problem_sizes, Tensor expert_offsets) -> ()", - {stride_tag}); + "Tensor problem_sizes, Tensor expert_offsets) -> ()"); // conditionally compiled so impl registration is in source file // cutlass nvfp4 block scaled group GEMM ops.def( "cutlass_fp4_group_mm(Tensor! out, Tensor a, Tensor b," " Tensor a_blockscale, Tensor b_blockscales, Tensor alphas," - " Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()", - {stride_tag}); + " Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()"); // conditionally compiled so impl registration is in source file // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column @@ -419,8 +398,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def( "cutlass_scaled_mm(Tensor! 
out, Tensor a," " Tensor b, Tensor a_scales," - " Tensor b_scales, Tensor? bias) -> ()", - {stride_tag}); + " Tensor b_scales, Tensor? bias) -> ()"); ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm); // CUTLASS w8a8 GEMM, supporting asymmetric per-tensor or per-row/column @@ -429,8 +407,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "cutlass_scaled_mm_azp(Tensor! out, Tensor a," " Tensor b, Tensor a_scales," " Tensor b_scales, Tensor azp_adj," - " Tensor? azp, Tensor? bias) -> ()", - {stride_tag}); + " Tensor? azp, Tensor? bias) -> ()"); ops.impl("cutlass_scaled_mm_azp", torch::kCUDA, &cutlass_scaled_mm_azp); // Check if cutlass scaled_mm is supported for CUDA devices of the given @@ -449,8 +426,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor a_scales, Tensor b_scales, Tensor expert_offsets, " " Tensor problem_sizes, Tensor a_strides, " " Tensor b_strides, Tensor c_strides, bool per_act_token, " - " bool per_out_ch) -> ()", - {stride_tag}); + " bool per_out_ch) -> ()"); ops.impl("cutlass_moe_mm", torch::kCUDA, &cutlass_moe_mm); // A function that computes data required to run fused MoE with w8a8 grouped @@ -464,8 +440,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor! problem_sizes1, Tensor! problem_sizes2, " " Tensor! input_permutation, " " Tensor! output_permutation, int num_experts, " - " int n, int k, Tensor? blockscale_offsets) -> ()", - {stride_tag}); + " int n, int k, Tensor? blockscale_offsets) -> " + "()"); ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data); // A function that computes problem sizes for each expert's multiplication @@ -476,8 +452,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor! problem_sizes1, " " Tensor! problem_sizes2, " " int num_experts, int n, int k, " - " Tensor? blockscale_offsets) -> ()", - {stride_tag}); + " Tensor? 
blockscale_offsets) -> ()"); ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA, &get_cutlass_moe_mm_problem_sizes); @@ -492,8 +467,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor! problem_sizes2, " " Tensor expert_num_tokens, " " int num_local_experts, int padded_m, " - " int n, int k) -> ()", - {stride_tag}); + " int n, int k) -> ()"); ops.impl("get_cutlass_pplx_moe_mm_data", torch::kCUDA, &get_cutlass_pplx_moe_mm_data); @@ -517,8 +491,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "cutlass_scaled_sparse_mm(Tensor! out, Tensor a," " Tensor bt_nzs," " Tensor bt_meta, Tensor a_scales," - " Tensor b_scales, Tensor? bias) -> ()", - {stride_tag}); + " Tensor b_scales, Tensor? bias) -> ()"); ops.impl("cutlass_scaled_sparse_mm", torch::kCUDA, &cutlass_scaled_sparse_mm); // CUTLASS sparse matrix compressor @@ -567,8 +540,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, " "Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, bool " "use_v2_format, int bit) " - "-> Tensor", - {stride_tag}); + "-> Tensor"); ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm); // Post processing for GPTQ. 
From fb8851f25485c3c94b0a71b77ff800f55ba328cf Mon Sep 17 00:00:00 2001 From: Vensen Date: Thu, 20 Nov 2025 18:52:02 +0800 Subject: [PATCH 063/249] [Bugfix][cache_kernels]: Fix OOB in cache_kernels.cu (#28760) Signed-off-by: vensen Signed-off-by: Vensenmu --- csrc/cache_kernels.cu | 19 +++++---- tests/kernels/test_cache_kernels.py | 65 +++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 7 deletions(-) create mode 100644 tests/kernels/test_cache_kernels.py diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 0aa0dc14c7480..a6c953ee0eac9 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -965,7 +965,9 @@ __global__ void gather_and_maybe_dequant_cache( } }; - for (int pid = split_start; pid < full_blocks_end; ++pid) { + const auto loop_end = + std::min((int64_t)full_blocks_end, block_table_stride - offset); + for (int pid = split_start; pid < loop_end; ++pid) { auto block_id = batch_block_table[pid]; auto block_start_ptr = src_cache + block_id * cache_block_stride; auto block_dst_ptr = dst + pid * block_size * dst_entry_stride; @@ -976,12 +978,15 @@ __global__ void gather_and_maybe_dequant_cache( } if (partial_block_size) { - auto block_id = batch_block_table[full_blocks_end]; - auto block_start_ptr = src_cache + block_id * cache_block_stride; - auto block_dst_ptr = dst + full_blocks_end * block_size * dst_entry_stride; - for (int eid = 0; eid < partial_block_size; ++eid) { - copy_entry(block_start_ptr + eid * cache_entry_stride, - block_dst_ptr + eid * dst_entry_stride); + if (offset + full_blocks_end < block_table_stride) { + auto block_id = batch_block_table[full_blocks_end]; + auto block_start_ptr = src_cache + block_id * cache_block_stride; + auto block_dst_ptr = + dst + full_blocks_end * block_size * dst_entry_stride; + for (int eid = 0; eid < partial_block_size; ++eid) { + copy_entry(block_start_ptr + eid * cache_entry_stride, + block_dst_ptr + eid * dst_entry_stride); + } } } } diff --git 
a/tests/kernels/test_cache_kernels.py b/tests/kernels/test_cache_kernels.py new file mode 100644 index 0000000000000..b5d66b4ede886 --- /dev/null +++ b/tests/kernels/test_cache_kernels.py @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for CUDA kernels in cache_kernels.cu.""" + +import pytest +import torch + +try: + from vllm import _custom_ops as ops +except ImportError: + pytest.skip( + "Could not import vllm._custom_ops. (pip install -e .)", allow_module_level=True + ) + + +@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Need CUDA device") +def test_gather_cache_oob(): + """ + Tests for OOB read in gather_and_maybe_dequant_cache (Issue #27909). + This test constructs a boundary case identified in the issue where + seq_starts causes the block_table offset to read out of bounds. + """ + + batch_size = 1 + block_size = 64 + entry_size = 128 + + block_table = torch.tensor([[1, 2]], dtype=torch.int32, device="cuda") + + # This will result in offset = 128 / block_size = 128 / 64 = 2 + # This will cause the kernel to try to read from + # block_table[0, 2], but its size is only 2. 
+ seq_starts = torch.tensor([128], dtype=torch.int32, device="cuda") + + seq_len = 65 + cu_seq_lens = torch.tensor([0, seq_len], dtype=torch.int32, device="cuda") + + # src_cache: [num_blocks, block_size, entry_size] + num_blocks = 5 + src_cache = torch.randn( + (num_blocks, block_size, entry_size), dtype=torch.float16, device="cuda" + ) + + dst = torch.empty((seq_len, entry_size), dtype=torch.float16, device="cuda") + + scale = torch.tensor([1.0], dtype=torch.float32, device="cuda") + + # Calling the C++ function gather_and_maybe_dequant_cache + ops.gather_and_maybe_dequant_cache( + src_cache, + dst, + block_table, + cu_seq_lens, + batch_size, + "auto", # kv_cache_dtype + scale, + seq_starts, + ) + + torch.cuda.synchronize() + assert True + + +if __name__ == "__main__": + pytest.main([__file__]) From dc45efc8ef7fc1e2571331eaf4671e1652e2a865 Mon Sep 17 00:00:00 2001 From: Dezhan Date: Thu, 20 Nov 2025 02:52:36 -0800 Subject: [PATCH 064/249] [BugFix] Fix Llama4 Pipeline Parallelism Assert Error (#28577) Co-authored-by: Dezhan Tu --- vllm/model_executor/models/llama4.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 4c6d1d4244755..e1bdfc3405f70 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -53,6 +53,7 @@ from vllm.model_executor.models.utils import sequence_parallel_chunk from .llama import LlamaForCausalLM, LlamaMLP, LlamaModel from .utils import ( AutoWeightsLoader, + PPMissingLayer, extract_layer_index, fast_topk, is_pp_missing_parameter, @@ -729,6 +730,9 @@ class Llama4ForCausalLM(LlamaForCausalLM, MixtureOfExperts): self.moe_layers = [] example_moe = None for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + assert isinstance(layer, Llama4DecoderLayer) if isinstance(layer.feed_forward, Llama4MoE): # Pick last one layer since the first ones may be dense layers. 
@@ -765,6 +769,9 @@ class Llama4ForCausalLM(LlamaForCausalLM, MixtureOfExperts): self.num_local_physical_experts = num_local_physical_experts self.num_redundant_experts = num_physical_experts - self.num_logical_experts for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + if isinstance(layer.feed_forward, Llama4MoE): moe = layer.feed_forward moe.n_local_physical_experts = num_local_physical_experts From edfe867208482ccadbf0ef503fc43e1fbb1e48f6 Mon Sep 17 00:00:00 2001 From: Jinzhen Lin Date: Thu, 20 Nov 2025 18:52:53 +0800 Subject: [PATCH 065/249] [Misc] don't cache `CUTLASS_REVISION` var in CMakeLists.txt (#28518) Signed-off-by: Jinzhen Lin Co-authored-by: Lucas Wilkinson --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ae8e6175443f3..a4cf51d17e982 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -307,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. 
- set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use") + set(CUTLASS_REVISION "v4.2.1") # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) From 66483a9d00e4e26647dd26b4c49f6eca73972b8c Mon Sep 17 00:00:00 2001 From: cjackal <44624812+cjackal@users.noreply.github.com> Date: Thu, 20 Nov 2025 19:53:09 +0900 Subject: [PATCH 066/249] [Chore] Update `xgrammar` version from 0.1.25 to 0.1.27 (#28221) Signed-off-by: cjackal <44624812+cjackal@users.noreply.github.com> --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index 1058ab91a02a5..f2d1c0762ef6a 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -24,7 +24,7 @@ outlines_core == 0.2.11 # required for outlines backend disk cache diskcache == 5.6.3 lark == 1.2.2 -xgrammar == 0.1.25; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" +xgrammar == 0.1.27; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 partial-json-parser # used for parsing partial JSON outputs From 6eb745d9bdf5b69bb63f897b32465c62ecb9e14a Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Thu, 20 Nov 2025 02:53:50 -0800 Subject: [PATCH 067/249] Add truncate arg to yarn to match openai implementation of gpt-oss (#28244) Signed-off-by: ashors1 Co-authored-by: Chen Zhang --- .../layers/rotary_embedding/__init__.py | 1 + .../layers/rotary_embedding/common.py | 14 +++++++------- .../layers/rotary_embedding/yarn_scaling_rope.py | 3 +++ vllm/model_executor/models/gpt_oss.py | 1 + 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py 
b/vllm/model_executor/layers/rotary_embedding/__init__.py index ae8a7d93b50e4..152d9401b8e94 100644 --- a/vllm/model_executor/layers/rotary_embedding/__init__.py +++ b/vllm/model_executor/layers/rotary_embedding/__init__.py @@ -197,6 +197,7 @@ def get_rope( "beta_fast", "beta_slow", "apply_yarn_scaling", + "truncate", ) } if "mrope_section" in rope_parameters: diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py index 196533b617959..13f8d15cc0f72 100644 --- a/vllm/model_executor/layers/rotary_embedding/common.py +++ b/vllm/model_executor/layers/rotary_embedding/common.py @@ -117,13 +117,13 @@ def yarn_find_correction_range( dim: int, base: float = 10000, max_position_embeddings: int = 2048, -) -> tuple[int, int]: - low = math.floor( - yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings) - ) - high = math.ceil( - yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings) - ) + truncate: bool = True, +) -> tuple[float | int, float | int]: + low = yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings) + high = yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings) + if truncate: + low = math.floor(low) + high = math.ceil(high) return max(low, 0), min(high, dim - 1) # Clamp values just in case diff --git a/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py b/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py index ff46ad74b302e..f01ca1e231211 100644 --- a/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/yarn_scaling_rope.py @@ -28,12 +28,14 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding): beta_fast: int = 32, beta_slow: int = 1, apply_yarn_scaling: bool = True, + truncate: bool = True, ) -> None: self.scaling_factor = scaling_factor self.extrapolation_factor = extrapolation_factor self.attn_factor = attn_factor self.beta_fast = beta_fast 
self.beta_slow = beta_slow + self.truncate = truncate # Get n-d magnitude scaling corrected for interpolation self.mscale = ( float(yarn_get_mscale(self.scaling_factor) * attn_factor) @@ -57,6 +59,7 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding): self.rotary_dim, self.base, self.max_position_embeddings, + self.truncate, ) # Get n-d rotational scaling corrected for extrapolation inv_freq_mask = ( diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 25048330f7974..8835acb8ec65c 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -78,6 +78,7 @@ class OAIAttention(nn.Module): ], "beta_fast": config.rope_parameters["beta_fast"], "beta_slow": config.rope_parameters["beta_slow"], + "truncate": config.rope_parameters.get("truncate", True), }, is_neox_style=True, ) From 06c20c9904644d8f65523bb747756b2eae706b8e Mon Sep 17 00:00:00 2001 From: Pleaplusone Date: Thu, 20 Nov 2025 18:54:01 +0800 Subject: [PATCH 068/249] [ROCm] Add AMD GPU support on Deepseek v3.2 and SparseMLA (#26670) Signed-off-by: ganyi --- csrc/cache_kernels.cu | 4 + vllm/attention/ops/rocm_aiter_mla_sparse.py | 210 +++++++++++ vllm/model_executor/models/deepseek_v2.py | 22 +- vllm/platforms/rocm.py | 13 +- vllm/utils/deep_gemm.py | 5 +- .../attention/backends/mla/flashmla_sparse.py | 2 +- vllm/v1/attention/backends/mla/indexer.py | 15 +- .../backends/mla/rocm_aiter_mla_sparse.py | 325 ++++++++++++++++++ vllm/v1/worker/utils.py | 2 +- 9 files changed, 583 insertions(+), 15 deletions(-) create mode 100644 vllm/attention/ops/rocm_aiter_mla_sparse.py create mode 100644 vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index a6c953ee0eac9..32960cc8073bb 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -552,7 +552,11 @@ __global__ void indexer_k_quant_and_cache_kernel( #ifndef USE_ROCM __syncwarp(); #endif +#if defined(__gfx942__) + float 
scale = fmaxf(amax, 1e-4) / 224.0f; +#else float scale = fmaxf(amax, 1e-4) / 448.0f; +#endif if (use_ue8m0) { scale = exp2f(ceilf(log2f(scale))); } diff --git a/vllm/attention/ops/rocm_aiter_mla_sparse.py b/vllm/attention/ops/rocm_aiter_mla_sparse.py new file mode 100644 index 0000000000000..080e92ecc9408 --- /dev/null +++ b/vllm/attention/ops/rocm_aiter_mla_sparse.py @@ -0,0 +1,210 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import importlib +from functools import lru_cache + +import torch + +from vllm._aiter_ops import rocm_aiter_ops +from vllm.logger import init_logger +from vllm.platforms import current_platform + +logger = init_logger(__name__) + + +# Take from https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_attention.py#L84 +def fp8_mqa_logits_torch( + q: torch.Tensor, + kv: tuple[torch.Tensor, torch.Tensor], + weights: torch.Tensor, + cu_seqlen_ks: torch.Tensor, + cu_seqlen_ke: torch.Tensor, +) -> torch.Tensor: + """Compute FP8 MQA logits for a single sequence without KV paging. + + Args: + q: Query tensor of shape [M, H, D]. Casted to + `torch.float8_e4m3fn` by caller. + kv: Tuple `(k_fp8, k_scales)` where `k_fp8` has shape [N, D] with + dtype `torch.float8_e4m3fn` and `k_scales` has shape [N] (or + [N, 1]) with dtype `torch.float32`. + weights: weights of shape [M, H], dtype `torch.float32`. + cu_seqlen_ks: Start indices (inclusive) for valid K per query position, + shape [M], dtype int32. + cu_seqlen_ke: End indices (exclusive) for valid K per query position, + shape [M], dtype int32. + + Returns: + Logits tensor of shape [M, N], dtype `torch.float32`. 
+ """ + kv, scale = kv + seq_len_kv = kv.shape[0] + k = kv.to(torch.bfloat16) + q = q.to(torch.bfloat16) + + mask_lo = ( + torch.arange(0, seq_len_kv, device="cuda")[None, :] >= cu_seqlen_ks[:, None] + ) + mask_hi = ( + torch.arange(0, seq_len_kv, device="cuda")[None, :] < cu_seqlen_ke[:, None] + ) + mask = mask_lo & mask_hi + + score = torch.einsum("mhd,nd->hmn", q, k).float() * scale + logits = (score.relu() * weights.unsqueeze(-1).transpose(0, 1)).sum(dim=0) + logits = logits.masked_fill(~mask, float("-inf")) + + return logits + + +def rocm_fp8_mqa_logits( + q: torch.Tensor, + kv: tuple[torch.Tensor, torch.Tensor], + weights: torch.Tensor, + cu_seqlen_ks: torch.Tensor, + cu_seqlen_ke: torch.Tensor, +) -> torch.Tensor: + """Compute FP8 MQA logits for a single sequence without KV paging. + + Args: + q: Query tensor of shape [M, H, D]. Casted to + `torch.float8_e4m3fn` by caller. + kv: Tuple `(k_fp8, k_scales)` where `k_fp8` has shape [N, D] with + dtype `torch.float8_e4m3fn` and `k_scales` has shape [N] (or + [N, 1]) with dtype `torch.float32`. + weights: weights of shape [M, H], dtype `torch.float32`. + cu_seqlen_ks: Start indices (inclusive) for valid K per query position, + shape [M], dtype int32. + cu_seqlen_ke: End indices (exclusive) for valid K per query position, + shape [M], dtype int32. + + Returns: + Logits tensor of shape [M, N], dtype `torch.float32`. 
+ """ + + # TODO(ganyi): Temporarily workaround, will remove the module check and reference + # path after aiter merge this kernel into main + @lru_cache + def has_mqa_logits_module(): + return importlib.util.find_spec("aiter.ops.triton.fp8_mqa_logits") is not None + + if rocm_aiter_ops.is_enabled() and has_mqa_logits_module(): + from aiter.ops.triton.fp8_mqa_logits import fp8_mqa_logits + + kv, scale = kv + return fp8_mqa_logits(q, kv, scale, weights, cu_seqlen_ks, cu_seqlen_ke) + else: + return fp8_mqa_logits_torch(q, kv, weights, cu_seqlen_ks, cu_seqlen_ke) + + +# Taken from https://github.com/deepseek-ai/DeepGEMM/blob/main/tests/test_attention.py#L156 +def fp8_paged_mqa_logits_torch( + q: torch.Tensor, + kv_cache: torch.Tensor, + weights: torch.Tensor, + context_lens: torch.Tensor, + block_tables: torch.Tensor, + max_model_len: int, +): + from vllm.utils.math_utils import cdiv + + fp8_dtype = current_platform.fp8_dtype() + batch_size, next_n, _, dim = q.size() + kv_cache, scale = kv_cache[..., :dim], kv_cache[..., dim:] + scale = scale.contiguous().view(torch.float) + q = q.float() + kv_cache = kv_cache.view(fp8_dtype).float() * scale + num_block, block_size, _, dim = kv_cache.size() + logits = torch.full( + [batch_size * next_n, max_model_len], + float("-inf"), + device=q.device, + dtype=torch.float32, + ) + context_lens = context_lens.tolist() + for i in range(batch_size): + context_len = context_lens[i] + q_offsets = torch.arange(context_len - next_n, context_len, device="cuda") + weight_slice = ( + weights[i * next_n : (i + 1) * next_n, :].transpose(0, 1).contiguous() + ) + for block_rk in range(cdiv(context_len, block_size)): + block_idx = block_tables[i][block_rk] + qx, kx = q[i], kv_cache[block_idx] + k_offsets = torch.arange( + block_rk * block_size, (block_rk + 1) * block_size, device="cuda" + ) + mask = (k_offsets[None, :] < context_len) & ( + k_offsets[None, :] <= q_offsets[:, None] + ) + s = torch.where( + mask[None, :, :], + (qx.transpose(0, 1) @ 
kx.transpose(0, 1).transpose(1, 2)).to( + logits.dtype + ), + float("-inf"), + ) + s = torch.relu(s) * weight_slice[..., None] + s = s.sum(dim=0) + logits[ + i * next_n : (i + 1) * next_n, + block_rk * block_size : (block_rk + 1) * block_size, + ] = torch.where(k_offsets[None, :] <= q_offsets[:, None], s, float("-inf")) + return logits + + +def rocm_fp8_paged_mqa_logits( + q_fp8: torch.Tensor, + kv_cache_fp8: torch.Tensor, + weights: torch.Tensor, + context_lens: torch.Tensor, + block_tables: torch.Tensor, + schedule_metadata: torch.Tensor, + max_model_len: int, +) -> torch.Tensor: + """Compute FP8 MQA logits using paged KV-cache. + + Args: + q_fp8: Query tensor of shape [B, next_n, H, D]. Casted to + `torch.float8_e4m3fn` by caller. + kv_cache_fp8: Paged KV-cache in packed FP8+scale layout with shape + [num_blocks, block_size, 1, D+4], dtype `torch.uint8`. The last + 4 bytes per (block,pos) store the `float` dequant scale. + weights: Tensor of shape [B * next_n, H], dtype `torch.float32`. + context_lens: Tensor of shape [B], dtype int32; effective context length + for each batch element. + block_tables: Tensor of shape [B, max_blocks], dtype int32; maps logical + block indices to physical blocks in the paged cache. + schedule_metadata: Returned by `get_paged_mqa_logits_metadata`; + used to distribute work across SMs. + max_model_len: Maximum sequence length used to size the logits output. + + Returns: + Logits tensor of shape [B * next_n, max_model_len], dtype + `torch.float32`. 
+ """ + + if rocm_aiter_ops.is_enabled(): + from aiter.ops.triton.pa_mqa_logits import deepgemm_fp8_paged_mqa_logits_stage1 + + batch_size, next_n, heads, _ = q_fp8.shape + out_qk = torch.full( + (heads, batch_size * next_n, max_model_len), + float("-inf"), + device="cuda", + dtype=torch.float32, + ) + deepgemm_fp8_paged_mqa_logits_stage1( + q_fp8, + kv_cache_fp8, + weights, + out_qk, + context_lens, + block_tables, + max_model_len, + ) + return out_qk.sum(dim=0) + else: + return fp8_paged_mqa_logits_torch( + q_fp8, kv_cache_fp8, weights, context_lens, block_tables, max_model_len + ) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index d0a116b97997a..7cfd381592b49 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -594,6 +594,7 @@ def sparse_attn_indexer( ) -> torch.Tensor: # careful! this will be None in dummy run attn_metadata = get_forward_context().attn_metadata + fp8_dtype = current_platform.fp8_dtype() # assert isinstance(attn_metadata, dict) if not isinstance(attn_metadata, dict): return sparse_attn_indexer_fake( @@ -633,7 +634,7 @@ def sparse_attn_indexer( k_fp8 = torch.empty( [chunk.total_seq_lens, head_dim], device=k.device, - dtype=torch.float8_e4m3fn, + dtype=fp8_dtype, ) k_scale = torch.empty( [chunk.total_seq_lens, 4], @@ -647,7 +648,12 @@ def sparse_attn_indexer( chunk.block_table, chunk.cu_seq_lens, ) - logits = fp8_mqa_logits( + fp8_mqa_logits_func = fp8_mqa_logits + if current_platform.is_rocm(): + from vllm.attention.ops.rocm_aiter_mla_sparse import rocm_fp8_mqa_logits + + fp8_mqa_logits_func = rocm_fp8_mqa_logits + logits = fp8_mqa_logits_func( q_fp8[chunk.token_start : chunk.token_end], (k_fp8, k_scale.view(torch.float32)), weights[chunk.token_start : chunk.token_end], @@ -692,7 +698,14 @@ def sparse_attn_indexer( next_n = padded_q_fp8_decode_tokens.shape[1] assert batch_size == decode_metadata.seq_lens.shape[0] num_padded_tokens = 
batch_size * next_n - logits = fp8_paged_mqa_logits( + fp8_paged_mqa_logits_func = fp8_paged_mqa_logits + if current_platform.is_rocm(): + from vllm.attention.ops.rocm_aiter_mla_sparse import ( + rocm_fp8_paged_mqa_logits, + ) + + fp8_paged_mqa_logits_func = rocm_fp8_paged_mqa_logits + logits = fp8_paged_mqa_logits_func( padded_q_fp8_decode_tokens, kv_cache, weights[:num_padded_tokens], @@ -749,7 +762,8 @@ def sparse_attn_indexer_fake( _flattened_kv = torch.empty( [total_seq_lens, head_dim + 4], device=k.device, dtype=torch.uint8 ) - _k_fp8 = _flattened_kv[..., :head_dim].view(torch.float8_e4m3fn).contiguous() + fp8_dtype = current_platform.fp8_dtype() + _k_fp8 = _flattened_kv[..., :head_dim].view(fp8_dtype).contiguous() _k_scale = _flattened_kv[..., head_dim:].view(torch.float32).contiguous() return topk_indices_buffer diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index f07f068a9249b..1a2f9226ddce8 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -225,7 +225,18 @@ class RocmPlatform(Platform): from vllm.attention.backends.registry import AttentionBackendEnum if use_sparse: - raise NotImplementedError("Sparse Attention is not supported on ROCm.") + if kv_cache_dtype.startswith("fp8"): + raise ValueError( + "ROCMAiterMLASparseBackend doesn't support fp8 kv_cache_dtype." + ) + assert block_size == 1, ( + "Sparse MLA backend on ROCm only supports block size 1 for now." + ) + logger.info_once("Using Sparse MLA backend on V1 engine.") + return ( + "vllm.v1.attention.backends.mla.rocm_aiter_mla_sparse." 
+ "ROCMAiterMLASparseBackend" + ) if use_mla: if selected_backend is None: diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 6b0a383a0e28c..b25c1e3e1ece3 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -325,6 +325,7 @@ DEFAULT_BLOCK_SIZE = [128, 128] def per_block_cast_to_fp8( x: torch.Tensor, block_size: list[int] = DEFAULT_BLOCK_SIZE, use_ue8m0: bool = False ) -> tuple[torch.Tensor, torch.Tensor]: + fp8_dtype = current_platform.fp8_dtype() assert x.dim() == 2 m, n = x.shape block_m, block_n = block_size @@ -334,9 +335,9 @@ def per_block_cast_to_fp8( x_padded[:m, :n] = x x_view = x_padded.view(-1, block_m, x_padded.size(1) // block_n, block_n) x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) - sf = x_amax / 448.0 + sf = x_amax / 224.0 if current_platform.is_fp8_fnuz() else x_amax / 448.0 sf = _ceil_to_ue8m0(sf) if use_ue8m0 else sf - x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn) + x_scaled = (x_view * (1.0 / sf)).to(fp8_dtype) return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view( x_view.size(0), x_view.size(2) ) diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py index bb8d914d15719..3f2cc8c38327e 100644 --- a/vllm/v1/attention/backends/mla/flashmla_sparse.py +++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py @@ -168,7 +168,7 @@ def _convert_req_index_to_global_index_kernel( inblock_off = tok % BLOCK_SIZE # Guard block_table access - valid_block = block_id < max_num_blocks_per_req + valid_block = (block_id < max_num_blocks_per_req) & (block_id >= 0) bt_ptr = block_table_ptr + req * bt_stride0 + block_id * bt_stride1 base = tl.load(bt_ptr, mask=valid_block, other=0) diff --git a/vllm/v1/attention/backends/mla/indexer.py b/vllm/v1/attention/backends/mla/indexer.py index 37aa5dad89a0e..cc0988435768c 100644 --- a/vllm/v1/attention/backends/mla/indexer.py +++ b/vllm/v1/attention/backends/mla/indexer.py @@ -11,7 +11,8 
@@ from vllm.attention.backends.abstract import ( ) from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils.deep_gemm import get_paged_mqa_logits_metadata +from vllm.platforms import current_platform +from vllm.utils.deep_gemm import get_paged_mqa_logits_metadata, is_deep_gemm_supported from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, @@ -23,7 +24,9 @@ logger = init_logger(__name__) class DeepseekV32IndexerBackend(AttentionBackend): - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [64] + supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [ + 1 if current_platform.is_rocm() else 64 + ] @classmethod def get_supported_head_sizes(cls) -> list[int]: @@ -328,10 +331,10 @@ class DeepseekV32IndexerMetadataBuilder(AttentionMetadataBuilder): requires_padding = (decode_lens_cpu.max() > decode_lens_cpu.min()).item() seq_lens = common_attn_metadata.seq_lens[:num_decodes] - - self.scheduler_metadata_buffer[:] = get_paged_mqa_logits_metadata( - seq_lens, self.kv_cache_spec.block_size, self.num_sms - ) + if is_deep_gemm_supported(): + self.scheduler_metadata_buffer[:] = get_paged_mqa_logits_metadata( + seq_lens, self.kv_cache_spec.block_size, self.num_sms + ) decode_metadata = DeepSeekV32IndexerDecodeMetadata( block_table=common_attn_metadata.block_table_tensor[:num_decodes, ...], seq_lens=common_attn_metadata.seq_lens[:num_decodes], diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py new file mode 100644 index 0000000000000..c0e7f0e380b98 --- /dev/null +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py @@ -0,0 +1,325 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass +from typing import TYPE_CHECKING, ClassVar, Optional + +import numpy as np +import torch + +from vllm import _custom_ops as ops 
+from vllm._aiter_ops import rocm_aiter_ops +from vllm.attention.backends.abstract import ( + AttentionBackend, + AttentionLayer, + AttentionMetadata, +) +from vllm.attention.backends.utils import get_mla_dims +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.v1.attention.backends.mla.common import ( + MLACommonBaseImpl, +) +from vllm.v1.attention.backends.mla.flashmla_sparse import ( + triton_convert_req_index_to_global_index, +) +from vllm.v1.attention.backends.utils import ( + AttentionCGSupport, + AttentionMetadataBuilder, + CommonAttentionMetadata, +) +from vllm.v1.kv_cache_interface import AttentionSpec + +if TYPE_CHECKING: + from vllm.model_executor.models.deepseek_v2 import Indexer +logger = init_logger(__name__) + + +class ROCMAiterMLASparseBackend(AttentionBackend): + accept_output_buffer: bool = True + + @staticmethod + def get_name() -> str: + return "ROCM_AITER_MLA_SPARSE" + + @staticmethod + def get_metadata_cls() -> type[AttentionMetadata]: + return ROCMAiterMLASparseMetadata + + @staticmethod + def get_builder_cls() -> type["ROCMAiterMLASparseMetadataBuilder"]: + return ROCMAiterMLASparseMetadataBuilder + + @staticmethod + def get_impl_cls() -> type["ROCMAiterMLASparseImpl"]: + return ROCMAiterMLASparseImpl + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, # assumed to be 1 for MLA + head_size: int, + cache_dtype_str: str = "auto", + ) -> tuple[int, ...]: + return (num_blocks, block_size, head_size) + + @classmethod + def get_supported_dtypes(cls) -> list[torch.dtype]: + return [torch.bfloat16] + + @classmethod + def get_supported_head_sizes(cls) -> list[int]: + return [576] + + +@dataclass +class ROCMAiterMLASparseMetadata: + num_reqs: int + max_query_len: int + max_seq_len: int + + num_actual_tokens: int # Number of tokens excluding padding. 
+ query_start_loc: torch.Tensor + slot_mapping: torch.Tensor + + block_table: torch.Tensor + req_id_per_token: torch.Tensor + block_size: int = 1 + topk_tokens: int = 2048 + + +@dataclass +class ROCMAiterMLASparseMetadataBuilder( + AttentionMetadataBuilder[ROCMAiterMLASparseMetadata] +): + cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.NEVER + + def __init__( + self, + kv_cache_spec: AttentionSpec, + layer_names: list[str], + vllm_config: VllmConfig, + device: torch.device, + ): + self.kv_cache_spec = kv_cache_spec + self.model_config = vllm_config.model_config + parallel_config = vllm_config.parallel_config + self.device = device + + self.num_heads = self.model_config.get_num_attention_heads(parallel_config) + self.mla_dims = get_mla_dims(self.model_config) + self.topk_tokens = vllm_config.model_config.hf_config.index_topk + self.topk_tokens_tensor = torch.tensor( + [self.topk_tokens], device=device, dtype=torch.int32 + ) + self.max_model_len_tensor = torch.tensor( + [self.model_config.max_model_len], device=device, dtype=torch.int32 + ) + # this is ignored by `flash_mla_with_kvcache` if indices not None + self.dummy_block_table = torch.empty( + (1, 1), dtype=torch.int32, device=self.device + ) + + self.req_id_per_token_buffer = torch.empty( + (vllm_config.scheduler_config.max_num_batched_tokens,), + dtype=torch.int32, + device=device, + ) + + def build( + self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False, + ) -> ROCMAiterMLASparseMetadata: + num_tokens = common_attn_metadata.num_actual_tokens + starts = np.asarray(common_attn_metadata.query_start_loc_cpu, dtype=np.int32) + seg_lengths = np.diff(starts) + req_id_per_token = np.repeat( + np.arange(seg_lengths.shape[0], dtype=np.int32), seg_lengths + ) + # Zero-fill for cudagraphs + self.req_id_per_token_buffer.fill_(0) + self.req_id_per_token_buffer[: req_id_per_token.shape[0]].copy_( + torch.from_numpy(req_id_per_token), 
non_blocking=True + ) + req_id_per_token = self.req_id_per_token_buffer[:num_tokens] + + metadata = ROCMAiterMLASparseMetadata( + num_reqs=common_attn_metadata.num_reqs, + max_query_len=common_attn_metadata.max_query_len, + max_seq_len=common_attn_metadata.max_seq_len, + num_actual_tokens=common_attn_metadata.num_actual_tokens, + query_start_loc=common_attn_metadata.query_start_loc, + slot_mapping=common_attn_metadata.slot_mapping, + block_table=common_attn_metadata.block_table_tensor, + req_id_per_token=req_id_per_token, + block_size=self.kv_cache_spec.block_size, + topk_tokens=self.topk_tokens, + ) + return metadata + + +# Take from +# https://github.com/deepseek-ai/FlashMLA/blob/main/tests/test_flash_mla_prefill.py#L72 +def reference_mla_sparse_prefill( + q: torch.Tensor, kv: torch.Tensor, indices: torch.Tensor, sm_scale: float, d_v: int +) -> tuple[torch.Tensor, torch.Tensor]: + import math + + def log2sumexp2(a: torch.Tensor, dim: int) -> torch.Tensor: + return torch.logsumexp(a * math.log(2), dim=dim) * math.log2(math.e) + + skv = kv.shape[0] + sq = q.shape[0] + topk = indices.shape[-1] + dqk = q.shape[-1] + indices = indices[:, 0, :] # [s_q, topk] + invalid_indices_mask = (indices < 0) | (indices >= skv) + indices[invalid_indices_mask] = 0 + qs = q # [s_q, h_q, d_qk] + kvs = kv[:, 0, :][indices].view(sq, topk, dqk) # [s_q, topk, d_qk] + + attn_score = (qs @ kvs.transpose(1, 2)).float() # [s_q, h_q, topk] + attn_score.masked_fill_(invalid_indices_mask.unsqueeze(1), float("-inf")) + attn_score *= sm_scale * math.log2(math.e) + lse = log2sumexp2(attn_score, dim=-1) # [s_q, h_q] + attn_score = torch.exp2(attn_score - lse.unsqueeze(-1)) # [s_q, h_q, topk] + result = attn_score.to(q.dtype) @ kvs[:, :, :d_v] + return (result, lse) + + +class ROCMAiterMLASparseImpl(MLACommonBaseImpl[ROCMAiterMLASparseMetadata]): + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: list[float] | None, + sliding_window: int | 
None, + kv_cache_dtype: str, + logits_soft_cap: float | None, + attn_type: str, + kv_sharing_target_layer_name: str | None, + # MLA Specific Arguments + topk_indice_buffer: torch.Tensor | None = None, + indexer: Optional["Indexer"] = None, + **mla_args, + ) -> None: + super().__init__( + num_heads, + head_size, + scale, + num_kv_heads, + alibi_slopes, + sliding_window, + kv_cache_dtype, + logits_soft_cap, + attn_type, + kv_sharing_target_layer_name, + **mla_args, + ) + self.softmax_scale = scale + assert indexer is not None + self.topk_indices_buffer = indexer.topk_indices_buffer + self.is_fp8bmm_enabled = rocm_aiter_ops.is_fp8bmm_enabled() + + def _forward_bf16_kv( + self, + q: torch.Tensor, + kv_c_and_k_pe_cache: torch.Tensor, + topk_indices: torch.Tensor, + attn_metadata: ROCMAiterMLASparseMetadata, + ) -> torch.Tensor: + num_tokens = q.shape[0] + kv_c_and_k_pe_cache = kv_c_and_k_pe_cache.view( + -1, 1, kv_c_and_k_pe_cache.shape[-1] + ) + + topk_indices = topk_indices.view(num_tokens, 1, -1) + output = reference_mla_sparse_prefill( + q, kv_c_and_k_pe_cache, topk_indices, self.softmax_scale, 512 + )[0] + return output[:, : self.num_heads, :] + + def forward( + self, + layer: AttentionLayer, + q: torch.Tensor, + k_c_normed: torch.Tensor, # key in unified attn + k_pe: torch.Tensor, # value in unified attn + kv_cache: torch.Tensor, + attn_metadata: ROCMAiterMLASparseMetadata, + output: torch.Tensor | None = None, + output_scale: torch.Tensor | None = None, + output_block_scale: torch.Tensor | None = None, + ) -> torch.Tensor: + # NOTE(lucas): for the sparse FlashMLA kernels the kernels want to use + # MQA 576/512 approach for both prefill and decode + + assert output is not None, "Output tensor must be provided." 
+ + if output_scale is not None or output_block_scale is not None: + raise NotImplementedError( + "fused output quantization is not yet supported for ROCMAiterMLASparse" + ) + + if attn_metadata is None: + # The zero fill is required when used with DP + EP + # to ensure all ranks within a DP group compute the + # same expert outputs. + return output.fill_(0) + + num_actual_toks = attn_metadata.num_actual_tokens + + # Inputs and outputs may be padded for CUDA graphs + + q = q[:num_actual_toks, ...] + k_c_normed = k_c_normed[:num_actual_toks, ...] + k_pe = k_pe[:num_actual_toks, ...] + + q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + # Convert from (B, N, P) to (N, B, P) + q_nope = q_nope.transpose(0, 1) + if self.is_fp8bmm_enabled: + # Multiply+Transpose (N, B, P)x(N, P, L)->(N, B, L)->(B, N, L) + ql_nope = rocm_aiter_ops.triton_fp8_bmm( + q_nope, self.W_K, self.W_K_scale, group_size=128, transpose_bm=True + ) + else: + # Multiply (N, B, P) x (N, P, L) -> (N, B, L) + ql_nope = torch.bmm(q_nope, self.W_UK_T) + # Convert from (N, B, L) to (B, N, L) + ql_nope = ql_nope.transpose(0, 1) + + topk_indices = self.topk_indices_buffer[:num_actual_toks] + + topk_indices_global = triton_convert_req_index_to_global_index( + attn_metadata.req_id_per_token, + attn_metadata.block_table, + topk_indices, + BLOCK_SIZE=attn_metadata.block_size, + NUM_TOPK_TOKENS=attn_metadata.topk_tokens, + ) + + q = torch.cat([ql_nope, q_pe], dim=-1) + + # write the latent and rope to kv cache + if kv_cache.numel() > 0: + ops.concat_and_cache_mla( + k_c_normed, + k_pe.squeeze(1), + kv_cache, + attn_metadata.slot_mapping.flatten(), + kv_cache_dtype=self.kv_cache_dtype, + scale=layer._k_scale, + ) + + attn_out = self._forward_bf16_kv( + q, kv_cache, topk_indices_global, attn_metadata + ) + + self._v_up_proj(attn_out, out=output[:num_actual_toks]) + return output diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 095407a8b9596..9e99ea964ee08 100644 --- 
a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -316,7 +316,7 @@ def bind_kv_cache( # TODO - analyze where runner_kv_caches is used and the right # way to ensure it properly reflects multiple attention layers # in the same decoder block. - if current_platform.is_cuda() or current_platform.is_xpu(): + if current_platform.is_cuda_alike() or current_platform.is_xpu(): # We know that the GPU runner is not impacted by this # case. Some test code depends on runner_kv_caches, but # not in a way that's impacted by ignoring this. From c0c2dd1e0b75c70706f4d8dbcd1d75f1c1750e14 Mon Sep 17 00:00:00 2001 From: Or Ozeri Date: Thu, 20 Nov 2025 12:55:10 +0200 Subject: [PATCH 069/249] [BugFix] kv_offloading: Fix bug in loading of partial cpu blocks (#28951) Signed-off-by: Or Ozeri Co-authored-by: Cyrus Leung --- tests/v1/kv_offload/test_cpu_gpu.py | 4 ++-- vllm/v1/kv_offload/worker/cpu_gpu.py | 20 +++++++++----------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/tests/v1/kv_offload/test_cpu_gpu.py b/tests/v1/kv_offload/test_cpu_gpu.py index 0d4fa344d298c..a248104e16d2d 100644 --- a/tests/v1/kv_offload/test_cpu_gpu.py +++ b/tests/v1/kv_offload/test_cpu_gpu.py @@ -103,8 +103,8 @@ def test_transfer( for i in range(gpu_blocks_per_cpu_block): cpu_blocks_in_gpu_block_size.append(i + base_block_id) - # maybe skip a GPU block to test writing to the middle of a CPU block - if gpu_to_cpu: + # maybe skip a GPU block to test reading from the middle of a CPU block + if not gpu_to_cpu: gpu_blocks = gpu_blocks[gpu_blocks_per_cpu_block - 1 :] cpu_blocks_in_gpu_block_size = cpu_blocks_in_gpu_block_size[ gpu_blocks_per_cpu_block - 1 : diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/worker/cpu_gpu.py index 0f2ec4a1b41f3..111046377a5da 100644 --- a/vllm/v1/kv_offload/worker/cpu_gpu.py +++ b/vllm/v1/kv_offload/worker/cpu_gpu.py @@ -135,22 +135,20 @@ class CpuGpuOffloadingHandler(OffloadingHandler): assert src_blocks.ndim == 1 assert dst_blocks.ndim == 
1 - dst_sub_blocks_to_skip = -src_blocks.size % dst_block_size_factor src_sub_block_count = src_blocks.size * src_block_size_factor + dst_sub_block_count = dst_blocks.size * dst_block_size_factor + src_sub_blocks_to_skip = -dst_blocks.size % src_block_size_factor - assert ( - src_sub_block_count - == dst_blocks.size * dst_block_size_factor - dst_sub_blocks_to_skip - ) + assert dst_sub_block_count == src_sub_block_count - src_sub_blocks_to_skip - src_to_dst = np.empty((src_sub_block_count, 2), dtype=np.int64) - expand_block_ids(src_blocks, src_block_size_factor, src_to_dst[:, 0]) + src_to_dst = np.empty((dst_sub_block_count, 2), dtype=np.int64) expand_block_ids( - dst_blocks, - dst_block_size_factor, - src_to_dst[:, 1], - skip_count=dst_sub_blocks_to_skip, + src_blocks, + src_block_size_factor, + src_to_dst[:, 0], + skip_count=src_sub_blocks_to_skip, ) + expand_block_ids(dst_blocks, dst_block_size_factor, src_to_dst[:, 1]) src_to_dst_tensor = torch.from_numpy(src_to_dst) event = self.events_pool.pop() if self.events_pool else torch.Event() From c9e093116c00781dda86df7a77e976c614b35d51 Mon Sep 17 00:00:00 2001 From: Shinichi Hemmi <50256998+Alnusjaponica@users.noreply.github.com> Date: Thu, 20 Nov 2025 20:00:19 +0900 Subject: [PATCH 070/249] [MODEL] Implement plamo3 (#28834) Signed-off-by: Shinichi Hemmi <50256998+Alnusjaponica@users.noreply.github.com> --- docs/models/supported_models.md | 1 + tests/distributed/test_pipeline_parallel.py | 1 + tests/models/registry.py | 4 + vllm/model_executor/models/plamo3.py | 431 ++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 5 files changed, 438 insertions(+) create mode 100644 vllm/model_executor/models/plamo3.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 80fe143269a76..f0531ced0aaa3 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -436,6 +436,7 @@ th { | `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. 
| ✅︎ | ✅︎ | | `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | | `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ | +| `Plamo3ForCausalLM` | PLaMo3 | `pfnet/plamo-3-nict-2b-base`, `pfnet/plamo-3-nict-8b-base`, etc. | | ✅︎ | | `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | | `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | | `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ | diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 0ab94d30858fb..89f035d2cdd6f 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -130,6 +130,7 @@ TEXT_GENERATION_MODELS = { "inceptionai/jais-13b-chat": PPTestSettings.fast(), "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(), "pfnet/plamo-2-1b": PPTestSettings.fast(), + "pfnet/plamo-3-nict-2b-base": PPTestSettings.fast(), "meta-llama/Llama-3.2-1B-Instruct": PPTestSettings.detailed(), # Tests TransformersForCausalLM "hmellor/Ilama-3.2-1B": PPTestSettings.fast(), diff --git a/tests/models/registry.py b/tests/models/registry.py index 094f921e4305f..1999e3cd2de2d 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -402,6 +402,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "pfnet/plamo-2-1b", trust_remote_code=True, ), + "Plamo3ForCausalLM": _HfExamplesInfo( + "pfnet/plamo-3-nict-2b-base", + trust_remote_code=True, + ), "QWenLMHeadModel": _HfExamplesInfo( "Qwen/Qwen-7B-Chat", max_transformers_version="4.53", diff --git a/vllm/model_executor/models/plamo3.py b/vllm/model_executor/models/plamo3.py new file mode 100644 index 0000000000000..5bb07722a5fc1 --- /dev/null +++ b/vllm/model_executor/models/plamo3.py @@ -0,0 +1,431 @@ +# SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Inference-only PLaMo3 model.""" + +from collections.abc import Iterable +from itertools import islice +from typing import Any + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.attention.layer import Attention +from vllm.compilation.decorators import support_torch_compile +from vllm.config import VllmConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed.parallel_state import get_pp_group +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + LoaderFunction, + composed_weight_loader, + default_weight_loader, +) +from vllm.model_executor.models.interfaces import SupportsPP +from vllm.model_executor.models.utils import ( + AutoWeightsLoader, + extract_layer_index, + make_empty_intermediate_tensors_factory, + make_layers, + maybe_prefix, +) +from vllm.model_executor.utils import set_weight_attrs +from vllm.sequence import IntermediateTensors + + +# Only used for type hinting. 
+class Plamo3Config(PretrainedConfig): # type: ignore + model_type: str = "plamo3" + + hidden_size: int + num_hidden_layers: int + rms_norm_eps: float + # Attention + num_attention_heads: int + head_dim: int + num_key_value_heads: int + # vllm rename `sliding_window` attr to `interleaved_sliding_window` + # if `sliding_window` is list + interleaved_sliding_window: list[int | None] + sliding_window_pattern: int + rope_theta: int + rope_local_theta: int + # MLP + intermediate_size: int + # Tokenizer + vocab_size: int + + +def rms_norm_weight_loader(offset: float) -> LoaderFunction: + return composed_weight_loader( + default_weight_loader, + lambda x: x + offset, + ) + + +class DenseMLP(nn.Module): + def __init__( + self, + config: Plamo3Config, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_up_proj = MergedColumnParallelLinear( + self.hidden_size, + [self.intermediate_size] * 2, + bias=False, + prefix=f"{prefix}.gate_up_proj", + quant_config=quant_config, + return_bias=False, + ) + self.act = SiluAndMul() + self.down_proj = RowParallelLinear( + self.intermediate_size, + self.hidden_size, + bias=False, + prefix=f"{prefix}.down_proj", + quant_config=quant_config, + return_bias=False, + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + h = self.gate_up_proj(hidden_states) + h = self.act(h) + return self.down_proj(h) + + +class Plamo3AttentionMixer(nn.Module): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> None: + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + + self.hidden_size = config.hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = config.num_attention_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + 
self.total_num_kv_heads = config.num_key_value_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = config.head_dim + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + + self.qkv_proj = QKVParallelLinear( + config.hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + layer_idx = extract_layer_index(prefix) + full_attn = config.interleaved_sliding_window[layer_idx] is None + + self.rope_theta = config.rope_theta if full_attn else config.rope_local_theta + self.rope_scaling = ( + config.rope_scaling if hasattr(config, "rope_scaling") else None + ) + max_position = config.max_position_embeddings + if hasattr(vllm_config.model_config, "max_model_len") and isinstance( + vllm_config.model_config.max_model_len, int + ): + max_position = min(max_position, vllm_config.model_config.max_model_len) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position, + base=self.rope_theta, + rope_scaling=self.rope_scaling, + ) + self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) + set_weight_attrs( + self.q_norm.weight, {"weight_loader": rms_norm_weight_loader(offset=1.0)} + ) + self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) + set_weight_attrs( + 
self.k_norm.weight, {"weight_loader": rms_norm_weight_loader(offset=1.0)} + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=vllm_config.cache_config, + per_layer_sliding_window=config.interleaved_sliding_window[layer_idx], + prefix=f"{prefix}.attn", + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + **kwargs: Any, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + q_shape = q.shape + q = q.reshape(q_shape[:-1] + (q_shape[-1] // self.head_dim, self.head_dim)) + q = self.q_norm.forward_native(q).reshape(q_shape) + k_shape = k.shape + k = k.reshape(k_shape[:-1] + (k_shape[-1] // self.head_dim, self.head_dim)) + k = self.k_norm.forward_native(k).reshape(k_shape) + + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + +class Plamo3DecoderLayer(nn.Module): + def __init__( + self, vllm_config: VllmConfig, prefix: str = "", **kwargs: Any + ) -> None: + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + + self.mixer = Plamo3AttentionMixer( + vllm_config=vllm_config, + prefix=f"{prefix}.mixer", + ) + + self.mlp = DenseMLP( + config=config, quant_config=quant_config, prefix=f"{prefix}.mlp" + ) + self.pre_mixer_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + set_weight_attrs( + self.pre_mixer_norm.weight, + {"weight_loader": rms_norm_weight_loader(offset=1.0)}, + ) + self.post_mixer_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + set_weight_attrs( + self.post_mixer_norm.weight, + {"weight_loader": rms_norm_weight_loader(offset=1.0 / 5)}, + ) + self.pre_mlp_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + set_weight_attrs( + self.pre_mlp_norm.weight, + 
{"weight_loader": rms_norm_weight_loader(offset=1.0)}, + ) + self.post_mlp_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + set_weight_attrs( + self.post_mlp_norm.weight, + {"weight_loader": rms_norm_weight_loader(offset=1.0 / (5**1.5))}, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + **kwargs: Any, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + if residual is None: + residual = hidden_states + hidden_states = self.pre_mixer_norm(hidden_states) + else: + hidden_states, residual = self.pre_mixer_norm(hidden_states, residual) + + hidden_states = self.mixer( + positions=positions, hidden_states=hidden_states, residual=residual + ) + hidden_states = self.post_mixer_norm(hidden_states) + # Fully Connected + hidden_states, residual = self.pre_mlp_norm(hidden_states, residual) + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_mlp_norm(hidden_states) + return hidden_states, residual + + +class Plamo3Decoder(torch.nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + num_hidden_layers = vllm_config.model_config.hf_config.num_hidden_layers + + self.start_layer, self.end_layer, self.layers = make_layers( + num_hidden_layers, + lambda prefix: Plamo3DecoderLayer(vllm_config, prefix=prefix), + prefix=f"{prefix}.layers", + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + for layer in islice(self.layers, self.start_layer, self.end_layer): + hidden_states, residual = layer( + positions=positions, + hidden_states=hidden_states, + residual=residual, + ) + return hidden_states, residual + + +@support_torch_compile +class Plamo3Model(nn.Module): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + + self.config = config + 
self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.org_vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + prefix=f"{prefix}.embed_tokens", + ) + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + self.layers = Plamo3Decoder(vllm_config, prefix=f"{prefix}.layers") + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + set_weight_attrs( + self.norm.weight, + {"weight_loader": rms_norm_weight_loader(offset=1.0)}, + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_input_ids(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + hidden_states, residual = self.layers( + positions=positions, hidden_states=hidden_states, residual=residual + ) + if not get_pp_group().is_last_rank: + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class Plamo3ForCausalLM(nn.Module, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + self.config = vllm_config.model_config.hf_config + self.vllm_config = vllm_config + self.model_config = 
vllm_config.model_config + self.scheduler_config = vllm_config.scheduler_config + + self.model = Plamo3Model( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + + self.vocab_size = self.config.vocab_size + self.unpadded_vocab_size = self.config.vocab_size + + num_embeddings = ((self.vocab_size + 15) // 16) * 16 + self.lm_head = ParallelLMHead( + num_embeddings, + self.config.hidden_size, + org_num_embeddings=self.config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE, + prefix=f"{prefix}.lm_head", + ) + if self.config.tie_word_embeddings: + self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens) + + self.logits_processor = LogitsProcessor( + self.unpadded_vocab_size, self.config.vocab_size + ) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.embed_input_ids(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor: + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + logits = self.logits_processor(self.lm_head, hidden_states) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index a2de597c87d88..4943987606201 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -157,6 +157,7 @@ _TEXT_GENERATION_MODELS = { "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), 
"PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"), "Plamo2ForCausalLM": ("plamo2", "Plamo2ForCausalLM"), + "Plamo3ForCausalLM": ("plamo3", "Plamo3ForCausalLM"), "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"), From 371b1d4c61335ed4c1d7fb2acee75274cc6d4551 Mon Sep 17 00:00:00 2001 From: Samit <285365963@qq.com> Date: Thu, 20 Nov 2025 19:01:03 +0800 Subject: [PATCH 071/249] [RL] Add Pause and Resume Generation for Asynchronous RL Training (#28037) Signed-off-by: SamitHuang <285365963@qq.com> Signed-off-by: Samit <285365963@qq.com> Signed-off-by: samithuang <285365963@qq.com> Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- vllm/engine/protocol.py | 27 ++++++++++ vllm/entrypoints/openai/api_server.py | 78 +++++++++++++++++++++++++++ vllm/v1/engine/async_llm.py | 64 ++++++++++++++++++++++ vllm/v1/engine/output_processor.py | 13 +++++ 4 files changed, 182 insertions(+) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 462d2c4e50e73..5e3374f9f6a10 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -149,6 +149,33 @@ class EngineClient(ABC): """Load a new LoRA adapter into the engine for future requests.""" ... + @abstractmethod + async def pause_generation( + self, + *, + wait_for_inflight_requests: bool = False, + clear_cache: bool = True, + ) -> None: + """Pause new generation/encoding requests. + + Args: + wait_for_inflight_requests: When ``True`` waits for in-flight requests + to finish before pausing. When ``False`` (default), aborts in-flight + requests immediately. + clear_cache: Whether to clear KV and prefix caches after draining. + """ + ... + + @abstractmethod + async def resume_generation(self) -> None: + """Resume accepting generation/encoding requests.""" + ... + + @abstractmethod + async def is_paused(self) -> bool: + """Return whether the engine is currently paused.""" + ... 
+ async def scale_elastic_ep( self, new_data_parallel_size: int, drain_timeout: int = 300 ) -> None: diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 3974f45a7135c..70174250ceabe 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -394,6 +394,84 @@ async def get_server_load_metrics(request: Request): return JSONResponse(content={"server_load": request.app.state.server_load_metrics}) +@router.post("/pause") +async def pause_generation( + raw_request: Request, + wait_for_inflight_requests: bool = Query(False), + clear_cache: bool = Query(True), +) -> JSONResponse: + """Pause generation requests to allow weight updates. + + Args: + wait_for_inflight_requests: When ``True`` waits for in-flight + requests to finish before pausing. When ``False`` (default), + aborts any in-flight requests immediately. + clear_cache: Whether to clear KV/prefix caches after draining. + """ + + engine = engine_client(raw_request) + + try: + await engine.pause_generation( + wait_for_inflight_requests=wait_for_inflight_requests, + clear_cache=clear_cache, + ) + return JSONResponse( + content={"status": "paused"}, + status_code=HTTPStatus.OK.value, + ) + + except ValueError as err: + return JSONResponse( + content={"error": str(err)}, + status_code=HTTPStatus.BAD_REQUEST.value, + ) + except Exception as err: # pragma: no cover - defensive + logger.exception("Failed to pause generation") + return JSONResponse( + content={"error": f"Failed to pause generation: {err}"}, + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + ) + + +@router.post("/resume") +async def resume_generation(raw_request: Request) -> JSONResponse: + """Resume generation after a pause.""" + + engine = engine_client(raw_request) + + try: + await engine.resume_generation() + return JSONResponse( + content={"status": "resumed"}, + status_code=HTTPStatus.OK.value, + ) + except Exception as err: # pragma: no cover - defensive + 
logger.exception("Failed to resume generation") + return JSONResponse( + content={"error": f"Failed to resume generation: {err}"}, + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + ) + + +@router.get("/is_paused") +async def is_paused(raw_request: Request) -> JSONResponse: + """Return the current pause status.""" + + engine = engine_client(raw_request) + + try: + paused = await engine.is_paused() + except Exception as err: # pragma: no cover - defensive + logger.exception("Failed to fetch pause status") + return JSONResponse( + content={"error": f"Failed to fetch pause status: {err}"}, + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + ) + + return JSONResponse(content={"is_paused": paused}) + + @router.post( "/tokenize", dependencies=[Depends(validate_json_request)], diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index abf2c8cfa4539..c64b3cccfc652 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -152,6 +152,10 @@ class AsyncLLM(EngineClient): ) self.logger_manager.log_engine_initialized() + # Pause / resume state for async RL workflows. + self._pause_cond = asyncio.Condition() + self._paused = False + self.output_handler: asyncio.Task | None = None try: # Start output handler eagerly if we are in the asyncio eventloop. @@ -404,6 +408,10 @@ class AsyncLLM(EngineClient): # to handle startup failure gracefully in the OpenAI server. self._run_output_handler() + # Wait until generation is resumed if the engine is paused. 
+ async with self._pause_cond: + await self._pause_cond.wait_for(lambda: not self._paused) + if tokenization_kwargs is None: tokenization_kwargs = {} truncate_prompt_tokens = sampling_params.truncate_prompt_tokens @@ -551,6 +559,58 @@ class AsyncLLM(EngineClient): if self.log_requests: logger.info("Aborted request(s) %s.", ",".join(request_ids)) + async def pause_generation( + self, + *, + wait_for_inflight_requests: bool = False, + clear_cache: bool = True, + ) -> None: + """ + Pause generation to allow model weight updates. + + New generation/encoding requests are blocked until resume. + + Args: + wait_for_inflight_requests: When ``True`` waits for in-flight + requests to finish before pausing. When ``False`` (default), + immediately aborts any in-flight requests. + clear_cache: Whether to clear KV cache and prefix cache after + draining. Set to ``False`` to preserve cache for faster resume. + Default is ``True`` (clear caches). + """ + + async with self._pause_cond: + if self._paused: + return + self._paused = True + + if not wait_for_inflight_requests: + request_ids = list(self.output_processor.request_states.keys()) + if request_ids: + await self.abort(request_ids) + + # Wait for running requests to drain before clearing cache. + if self.output_processor.has_unfinished_requests(): + await self.output_processor.wait_for_requests_to_drain() + + # Clear cache + if clear_cache: + await self.reset_prefix_cache() + await self.reset_mm_cache() + + async def resume_generation(self) -> None: + """Resume generation after :meth:`pause_generation`.""" + + async with self._pause_cond: + self._paused = False + self._pause_cond.notify_all() # Wake up all waiting requests + + async def is_paused(self) -> bool: + """Return whether the engine is currently paused.""" + + async with self._pause_cond: + return self._paused + async def encode( self, prompt: PromptType, @@ -582,6 +642,10 @@ class AsyncLLM(EngineClient): # to handle startup failure gracefully in the OpenAI server. 
self._run_output_handler() + # Respect pause state before accepting new requests. + async with self._pause_cond: + await self._pause_cond.wait_for(lambda: not self._paused) + if tokenization_kwargs is None: tokenization_kwargs = {} _validate_truncation_size( diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index bdbbfe2595f81..0453c4a77f0cd 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -350,6 +350,8 @@ class OutputProcessor: self.parent_requests: dict[str, ParentRequest] = {} self.lora_states = LoRARequestStates(log_stats) self.tracer: Tracer | None = None + self._requests_drained = asyncio.Event() + self._requests_drained.set() def get_num_unfinished_requests(self): return len(self.request_states) @@ -357,6 +359,11 @@ class OutputProcessor: def has_unfinished_requests(self) -> bool: return len(self.request_states) > 0 + async def wait_for_requests_to_drain(self) -> None: + if not self.request_states: + return + await self._requests_drained.wait() + def propagate_error(self, e: Exception): """Propagate error to all generate() tasks.""" @@ -396,6 +403,8 @@ class OutputProcessor: child_reqs = self.abort_requests(child_reqs) request_ids_to_abort.extend(child_reqs) self.parent_requests.pop(request_id, None) + if not self.request_states: + self._requests_drained.set() return request_ids_to_abort def add_request( @@ -420,6 +429,8 @@ class OutputProcessor: log_stats=self.log_stats, stream_interval=self.stream_interval, ) + if self._requests_drained.is_set(): + self._requests_drained.clear() self.request_states[request_id] = req_state if parent_req: self.parent_requests[parent_req.request_id] = parent_req @@ -511,6 +522,8 @@ class OutputProcessor: parent_req = req_state.parent_req if parent_req and not parent_req.child_requests: self.parent_requests.pop(parent_req.request_id, None) + if not self.request_states: + self._requests_drained.set() if not engine_core_output.finished: # If req not 
finished in EngineCore, but Detokenizer # detected stop string, abort needed in EngineCore. From 93c8672ceb06f6e9c282a96fcd85a7ce41293693 Mon Sep 17 00:00:00 2001 From: Zhewen Li Date: Thu, 20 Nov 2025 03:05:50 -0800 Subject: [PATCH 072/249] [Bugfix] Fix spec decode memory regression after #28549 (#28819) Signed-off-by: zhewenli --- vllm/model_executor/models/deepseek_eagle.py | 5 ----- vllm/model_executor/models/llama4_eagle.py | 7 ------- vllm/model_executor/models/llama_eagle.py | 5 ----- vllm/v1/spec_decode/eagle.py | 7 +++++-- 4 files changed, 5 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/deepseek_eagle.py b/vllm/model_executor/models/deepseek_eagle.py index 3fb04c3b70dd1..4d7a37292cb02 100644 --- a/vllm/model_executor/models/deepseek_eagle.py +++ b/vllm/model_executor/models/deepseek_eagle.py @@ -8,7 +8,6 @@ import torch.nn as nn from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig -from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -172,10 +171,6 @@ class DeepseekV2Model(nn.Module): ) break else: - # if PP disabled then draft will share embed with target - if get_pp_group().world_size == 1 and "embed_tokens." in name: - continue - # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: continue diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py index 660c8f1bb5226..0146b30579287 100644 --- a/vllm/model_executor/models/llama4_eagle.py +++ b/vllm/model_executor/models/llama4_eagle.py @@ -23,7 +23,6 @@ import torch.nn as nn from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig -from vllm.distributed.parallel_state import get_pp_group from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -127,17 +126,11 @@ class LlamaModel(nn.Module): weight_loader(param, loaded_weight, shard_id) break else: - # if PP disabled then draft will share embed with target - if get_pp_group().world_size == 1 and "embed_tokens." in name: - continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) for name in params_dict: - # if PP disabled then draft will share embed with target - if get_pp_group().world_size == 1 and "embed_tokens." in name: - continue assert name in loaded_params, f"{name} is not loaded!" 
return loaded_params diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index 90ab5c50361b6..05cb456e7776e 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -9,7 +9,6 @@ from transformers import LlamaConfig from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig -from vllm.distributed.parallel_state import get_pp_group from vllm.logger import init_logger from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -155,10 +154,6 @@ class LlamaModel(nn.Module): weight_loader(param, loaded_weight, shard_id) break else: - # if PP disabled then draft will share embed with target - if get_pp_group().world_size == 1 and "embed_tokens." in name: - continue - param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 406bb696bd4cf..ba37bc81607fe 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -1028,8 +1028,11 @@ class EagleProposer: elif ( isinstance(target_embed_tokens.weight, torch.Tensor) and isinstance(self.model.model.embed_tokens.weight, torch.Tensor) - and torch.equal( - target_embed_tokens.weight, self.model.model.embed_tokens.weight + and torch.allclose( + target_embed_tokens.weight.cpu(), + self.model.model.embed_tokens.weight.cpu(), + rtol=1e-5, + atol=1e-7, ) ): share_embeddings = True From a2e9ebe9e242295a58e400835ef98a14b29c4fb0 Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Thu, 20 Nov 2025 20:14:29 +0800 Subject: [PATCH 073/249] [BugFix] Fix flash_attn import in `siglip2navit.py` (#29082) Signed-off-by: Fanli Lin --- vllm/model_executor/models/siglip2navit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/vllm/model_executor/models/siglip2navit.py b/vllm/model_executor/models/siglip2navit.py index 46f5e67d659ef..c185b45345bd5 100644 --- a/vllm/model_executor/models/siglip2navit.py +++ b/vllm/model_executor/models/siglip2navit.py @@ -191,7 +191,7 @@ def apply_rotary_pos_emb( cos = cos.chunk(2, dim=-1)[0].contiguous() sin = sin.chunk(2, dim=-1)[0].contiguous() if is_flash_attn_backend and not current_platform.is_xpu(): - from flash_attn.layers.rotary import apply_rotary_emb + from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb apply_rotary_emb_func = apply_rotary_emb else: From 82b05b15e61badfd0c5912d4c3eebc88043c9ef8 Mon Sep 17 00:00:00 2001 From: TJian Date: Thu, 20 Nov 2025 23:34:11 +0700 Subject: [PATCH 074/249] [BugFix] [FEAT] Enable fastsafetensors for ROCm platform (#28225) Signed-off-by: tjtanaa --- requirements/rocm.txt | 1 + .../fastsafetensors_loader/test_fastsafetensors_loader.py | 3 ++- .../model_loader/fastsafetensors_loader/test_weight_utils.py | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/requirements/rocm.txt b/requirements/rocm.txt index 6f1cca90e5e2b..abbd33d6e1240 100644 --- a/requirements/rocm.txt +++ b/requirements/rocm.txt @@ -15,3 +15,4 @@ setuptools-scm>=8 runai-model-streamer[s3,gcs]==0.15.0 conch-triton-kernels==1.2.1 timm>=1.0.17 +fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459 diff --git a/tests/model_executor/model_loader/fastsafetensors_loader/test_fastsafetensors_loader.py b/tests/model_executor/model_loader/fastsafetensors_loader/test_fastsafetensors_loader.py index f154df6dfc232..c5b3c731ffc64 100644 --- a/tests/model_executor/model_loader/fastsafetensors_loader/test_fastsafetensors_loader.py +++ b/tests/model_executor/model_loader/fastsafetensors_loader/test_fastsafetensors_loader.py @@ -19,7 +19,8 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0) @pytest.mark.skipif( - not 
current_platform.is_cuda(), reason="fastsafetensors requires CUDA/NVIDIA GPUs" + not current_platform.is_cuda_alike(), + reason="fastsafetensors requires NVIDIA/AMD GPUs", ) def test_model_loader_download_files(vllm_runner): with vllm_runner(test_model, load_format="fastsafetensors") as llm: diff --git a/tests/model_executor/model_loader/fastsafetensors_loader/test_weight_utils.py b/tests/model_executor/model_loader/fastsafetensors_loader/test_weight_utils.py index bd216f0e41a47..1975eb61b25da 100644 --- a/tests/model_executor/model_loader/fastsafetensors_loader/test_weight_utils.py +++ b/tests/model_executor/model_loader/fastsafetensors_loader/test_weight_utils.py @@ -17,7 +17,8 @@ from vllm.platforms import current_platform @pytest.mark.skipif( - not current_platform.is_cuda(), reason="fastsafetensors requires CUDA/NVIDIA GPUs" + not current_platform.is_cuda_alike(), + reason="fastsafetensors requires NVIDIA/AMD GPUs", ) def test_fastsafetensors_model_loader(): with tempfile.TemporaryDirectory() as tmpdir: From 56f45eddaff817ec7118bf9a73c5e4b560738bed Mon Sep 17 00:00:00 2001 From: rookie <66160395+zhanggzh@users.noreply.github.com> Date: Fri, 21 Nov 2025 01:02:30 +0800 Subject: [PATCH 075/249] [Frontend] Optimize beam search loop by sorting and then splicing (#19347) Signed-off-by: zhangguozhu Signed-off-by: mgoin Co-authored-by: zhangguozhu Co-authored-by: mgoin --- vllm/entrypoints/openai/serving_engine.py | 101 +++++++++++++++------- 1 file changed, 69 insertions(+), 32 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index c50b0c4a23e17..127b8e6dcb87c 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -10,6 +10,7 @@ from concurrent.futures import ThreadPoolExecutor from http import HTTPStatus from typing import Any, ClassVar, Generic, TypeAlias, TypeVar +import numpy as np import torch from fastapi import Request from pydantic import 
BaseModel, ConfigDict, Field, TypeAdapter @@ -389,8 +390,9 @@ class OpenAIServing: sort_beams_key = create_sort_beams_key_function(eos_token_id, length_penalty) + logprobs_num = 2 * beam_width beam_search_params = SamplingParams( - logprobs=2 * beam_width, + logprobs=logprobs_num, max_tokens=1, temperature=temperature, ) @@ -443,40 +445,75 @@ class OpenAIServing: output = [x[0] for x in await asyncio.gather(*tasks)] new_beams = [] - for i, current_beam in enumerate(all_beams): - result = output[i] - + # Store all new tokens generated by beam + all_beams_token_id = [] + # Store the cumulative probability of all tokens + # generated by beam search + all_beams_logprob = [] + # Iterate through all beam inference results + for i, result in enumerate(output): + current_beam = all_beams[i] if result.outputs[0].logprobs is not None: logprobs = result.outputs[0].logprobs[0] - for token_id, logprob_obj in logprobs.items(): - if token_id == eos_token_id and not ignore_eos: - completed.append( - BeamSearchSequence( - tokens=current_beam.tokens + [token_id] - if include_stop_str_in_output - else current_beam.tokens, - logprobs=current_beam.logprobs + [logprobs], - cum_logprob=current_beam.cum_logprob - + logprob_obj.logprob, - finish_reason="stop", - stop_reason=eos_token_id, - ) - ) - else: - new_beams.append( - BeamSearchSequence( - tokens=current_beam.tokens + [token_id], - logprobs=current_beam.logprobs + [logprobs], - lora_request=current_beam.lora_request, - cum_logprob=current_beam.cum_logprob - + logprob_obj.logprob, - multi_modal_data=current_beam.multi_modal_data, - mm_processor_kwargs=current_beam.mm_processor_kwargs, - ) - ) + all_beams_token_id.extend(list(logprobs.keys())) + all_beams_logprob.extend( + [ + current_beam.cum_logprob + obj.logprob + for obj in logprobs.values() + ] + ) - sorted_beams = sorted(new_beams, key=sort_beams_key, reverse=True) - all_beams = sorted_beams[:beam_width] + # Handle the token for the end of sentence (EOS) + all_beams_token_id = 
np.array(all_beams_token_id) + all_beams_logprob = np.array(all_beams_logprob) + + if not ignore_eos: + # Get the index position of eos token in all generated results + eos_idx = np.where(all_beams_token_id == eos_token_id)[0] + for idx in eos_idx: + current_beam = all_beams[idx // logprobs_num] + result = output[idx // logprobs_num] + assert result.outputs[0].logprobs is not None + logprobs_entry = result.outputs[0].logprobs[0] + completed.append( + BeamSearchSequence( + tokens=current_beam.tokens + [eos_token_id] + if include_stop_str_in_output + else current_beam.tokens, + logprobs=current_beam.logprobs + [logprobs_entry], + cum_logprob=float(all_beams_logprob[idx]), + finish_reason="stop", + stop_reason=eos_token_id, + ) + ) + # After processing, set the log probability of the eos condition + # to negative infinity. + all_beams_logprob[eos_idx] = -np.inf + + # Processing non-EOS tokens + # Get indices of the top beam_width probabilities + topn_idx = np.argpartition(np.negative(all_beams_logprob), beam_width)[ + :beam_width + ] + + for idx in topn_idx: + current_beam = all_beams[idx // logprobs_num] + result = output[idx // logprobs_num] + token_id = int(all_beams_token_id[idx]) + assert result.outputs[0].logprobs is not None + logprobs_entry = result.outputs[0].logprobs[0] + new_beams.append( + BeamSearchSequence( + tokens=current_beam.tokens + [token_id], + logprobs=current_beam.logprobs + [logprobs_entry], + lora_request=current_beam.lora_request, + cum_logprob=float(all_beams_logprob[idx]), + multi_modal_data=current_beam.multi_modal_data, + mm_processor_kwargs=current_beam.mm_processor_kwargs, + ) + ) + + all_beams = new_beams completed.extend(all_beams) sorted_completed = sorted(completed, key=sort_beams_key, reverse=True) From 22924383e14a7a37ee86cf6e15f39e13efc86f7c Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Thu, 20 Nov 2025 11:07:06 -0600 Subject: [PATCH 076/249] Updating the mirror 
of test-amd.yaml as of 2025-11-18 (#29016) Signed-off-by: Alexei V. Ivanov --- .buildkite/test-amd.yaml | 41 ++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 37c6bd4276722..4e2ff5c5a6bd5 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -61,7 +61,7 @@ steps: - pytest -v -s -m 'not cpu_test' multimodal - pytest -v -s utils_ -- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins +- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins timeout_in_minutes: 10 mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 @@ -73,6 +73,7 @@ steps: - tests/multimodal - tests/standalone_tests/lazy_imports.py - tests/transformers_utils + - tests/config no_gpu: true commands: - python3 standalone_tests/lazy_imports.py @@ -80,6 +81,7 @@ steps: - pytest -v -s test_outputs.py - pytest -v -s -m 'cpu_test' multimodal - pytest -v -s transformers_utils + - pytest -v -s config - label: Python-only Installation Test # 10min timeout_in_minutes: 20 @@ -390,6 +392,15 @@ steps: commands: - pytest -v -s v1/attention +- label: V1 Test attention (B200) # 10min + timeout_in_minutes: 30 + gpu: b200 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this + - label: V1 Test others (CPU) # 5 mins mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 @@ -529,7 +540,7 @@ steps: - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' # Limit to no custom ops to reduce running time # Wrap with quotes to escape yaml and avoid starting -k string with a - - - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and -quant_fp8'" + - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not 
Llama-4'" - label: Cudagraph test timeout_in_minutes: 20 @@ -694,7 +705,7 @@ steps: - vllm/model_executor/models/whisper.py commands: # LMEval # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442 - - pytest -s entrypoints/openai/correctness/ --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py + - pytest -s entrypoints/openai/correctness/ - label: OpenAI-Compatible Tool Use # 23 min timeout_in_minutes: 35 @@ -995,12 +1006,12 @@ steps: optional: true commands: - pip install --upgrade git+https://github.com/huggingface/transformers - - pytest -v -s tests/models/test_initialization.py + - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)' - pytest -v -s tests/models/test_transformers.py - - pytest -v -s tests/models/multimodal/processing/ - - pytest -v -s tests/models/multimodal/test_mapping.py + # - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)' - python3 examples/offline_inference/basic/chat.py - - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper @@ -1045,7 +1056,7 @@ steps: - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - pytest -v -s tests/kernels/moe/test_flashinfer.py -- label: Blackwell Fusion Tests # 30 min +- label: Blackwell Fusion and Compile Tests # 
30 min timeout_in_minutes: 40 working_dir: "/vllm-workspace/" gpu: b200 @@ -1066,7 +1077,9 @@ steps: - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time # Wrap with quotes to escape yaml - - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'" + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + - pytest -v -s tests/compile/distributed/test_full_graph.py::test_fp8_kv_scale_compile - label: Blackwell Fusion E2E Tests # 30 min timeout_in_minutes: 40 @@ -1088,15 +1101,13 @@ steps: commands: - nvidia-smi # Run all e2e fusion tests - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py - # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + - pytest -v -s tests/compile/test_fusions_e2e.py - label: ROCm GPT-OSS Eval timeout_in_minutes: 60 working_dir: "/vllm-workspace/" agent_pool: mi325_1 - mirror_hardwares: [amdproduction] + mirror_hardwares: [amdexperimental, amdproduction] optional: true # run on nightlies source_file_dependencies: - tests/evals/gpt_oss @@ -1416,7 +1427,9 @@ steps: - pytest -v -s tests/compile/distributed/test_async_tp.py - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not 
Llama-4'" + - pytest -v -s tests/compile/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 - pytest -v -s tests/v1/distributed/test_dbo.py From e5bfcb6a88cda4f91e3c7074d7e76bb5d1d36362 Mon Sep 17 00:00:00 2001 From: Pan Li <1162953505@qq.com> Date: Fri, 21 Nov 2025 01:38:31 +0800 Subject: [PATCH 077/249] [BugFix][PD]: make example proxy usable with P2pNcclConnector (#26628) Signed-off-by: PAN <1162953505@qq.com> --- .../disagg_prefill_proxy_server.py | 245 +++++++++++------- .../online_serving/disaggregated_prefill.sh | 19 +- 2 files changed, 167 insertions(+), 97 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index 904f805349148..d072c03c440b2 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -5,11 +5,12 @@ import argparse import asyncio import logging import os +import time +import uuid +from urllib.parse import urlparse import aiohttp from quart import Quart, Response, make_response, request -from rate_limiter import RateLimiter -from request_queue import RequestQueue # Configure logging logging.basicConfig(level=logging.INFO) @@ -24,26 +25,8 @@ def parse_args(): parser.add_argument( "--timeout", type=float, - default=300, - help="Timeout for backend service requests in seconds (default: 300)", - ) - parser.add_argument( - "--max-concurrent", - type=int, - default=100, - help="Maximum concurrent requests to backend services (default: 100)", - ) - parser.add_argument( - "--queue-size", - type=int, - default=500, - help="Maximum number of requests in the queue (default: 500)", - ) - 
parser.add_argument( - "--rate-limit", - type=int, - default=40, - help="Maximum requests per second (default: 40)", + default=6 * 60 * 60, + help="Timeout for backend service requests in seconds (default: 21600)", ) parser.add_argument( "--port", @@ -54,14 +37,32 @@ def parse_args(): parser.add_argument( "--prefill-url", type=str, - default="http://localhost:8100/v1/completions", - help="Prefill service endpoint URL", + default="http://localhost:8100", + help="Prefill service base URL (protocol + host[:port])", ) parser.add_argument( "--decode-url", type=str, - default="http://localhost:8200/v1/completions", - help="Decode service endpoint URL", + default="http://localhost:8200", + help="Decode service base URL (protocol + host[:port])", + ) + parser.add_argument( + "--kv-host", + type=str, + default="localhost", + help="Hostname or IP used by KV transfer (default: localhost)", + ) + parser.add_argument( + "--prefill-kv-port", + type=int, + default=14579, + help="Prefill KV port (default: 14579)", + ) + parser.add_argument( + "--decode-kv-port", + type=int, + default=14580, + help="Decode KV port (default: 14580)", ) return parser.parse_args() @@ -73,70 +74,129 @@ def main(): # Initialize configuration using command line parameters AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout) - MAX_CONCURRENT_REQUESTS = args.max_concurrent - REQUEST_QUEUE_SIZE = args.queue_size - RATE_LIMIT = args.rate_limit PREFILL_SERVICE_URL = args.prefill_url DECODE_SERVICE_URL = args.decode_url PORT = args.port + PREFILL_KV_ADDR = f"{args.kv_host}:{args.prefill_kv_port}" + DECODE_KV_ADDR = f"{args.kv_host}:{args.decode_kv_port}" + + logger.info( + "Proxy resolved KV addresses -> prefill: %s, decode: %s", + PREFILL_KV_ADDR, + DECODE_KV_ADDR, + ) + app = Quart(__name__) - # Initialize the rate limiter and request queue - rate_limiter = RateLimiter(RATE_LIMIT) - request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE) - - # Attach the configuration object to the 
application instance + # Attach the configuration object to the application instance so helper + # coroutines can read the resolved backend URLs and timeouts without using + # globals. app.config.update( { "AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT, - "rate_limiter": rate_limiter, - "request_queue": request_queue, "PREFILL_SERVICE_URL": PREFILL_SERVICE_URL, "DECODE_SERVICE_URL": DECODE_SERVICE_URL, + "PREFILL_KV_ADDR": PREFILL_KV_ADDR, + "DECODE_KV_ADDR": DECODE_KV_ADDR, } ) - # Start queue processing on app startup - @app.before_serving - async def startup(): - """Start request processing task when app starts serving""" - asyncio.create_task(request_queue.process()) + def _normalize_base_url(url: str) -> str: + """Remove any trailing slash so path joins behave predictably.""" + return url.rstrip("/") - async def forward_request(url, data): - """Forward request to backend service with rate limiting and error handling""" - headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + def _get_host_port(url: str) -> str: + """Return the hostname:port portion for logging and KV headers.""" + parsed = urlparse(url) + host = parsed.hostname or "localhost" + port = parsed.port + if port is None: + port = 80 if parsed.scheme == "http" else 443 + return f"{host}:{port}" - # Use rate limiter as context manager - async with ( - rate_limiter, - aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, - ): - try: - async with session.post( - url=url, json=data, headers=headers - ) as response: - if response.status == 200: - # Stream response chunks - async for chunk_bytes in response.content.iter_chunked(1024): - yield chunk_bytes - else: - # Handle backend service errors - error_text = await response.text() - logger.error( - "Backend service error: %s - %s", - response.status, - error_text, - ) - yield b'{"error": "Backend service error"}' - except aiohttp.ClientError as e: - # Handle connection errors - logger.error("Connection error to %s: %s", url, str(e)) - yield 
b'{"error": "Service unavailable"}' - except asyncio.TimeoutError: - # Handle timeout errors - logger.error("Timeout connecting to %s", url) - yield b'{"error": "Service timeout"}' + PREFILL_BASE = _normalize_base_url(PREFILL_SERVICE_URL) + DECODE_BASE = _normalize_base_url(DECODE_SERVICE_URL) + KV_TARGET = _get_host_port(DECODE_SERVICE_URL) + + def _build_headers(request_id: str) -> dict[str, str]: + """Construct the headers expected by vLLM's P2P disagg connector.""" + headers: dict[str, str] = {"X-Request-Id": request_id, "X-KV-Target": KV_TARGET} + api_key = os.environ.get("OPENAI_API_KEY") + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + return headers + + async def _run_prefill( + request_path: str, + payload: dict, + headers: dict[str, str], + request_id: str, + ): + url = f"{PREFILL_BASE}{request_path}" + start_ts = time.perf_counter() + logger.info("[prefill] start request_id=%s url=%s", request_id, url) + try: + async with ( + aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, + session.post(url=url, json=payload, headers=headers) as resp, + ): + if resp.status != 200: + error_text = await resp.text() + raise RuntimeError( + f"Prefill backend error {resp.status}: {error_text}" + ) + await resp.read() + logger.info( + "[prefill] done request_id=%s status=%s elapsed=%.2fs", + request_id, + resp.status, + time.perf_counter() - start_ts, + ) + except asyncio.TimeoutError as exc: + raise RuntimeError(f"Prefill service timeout at {url}") from exc + except aiohttp.ClientError as exc: + raise RuntimeError(f"Prefill service unavailable at {url}") from exc + + async def _stream_decode( + request_path: str, + payload: dict, + headers: dict[str, str], + request_id: str, + ): + url = f"{DECODE_BASE}{request_path}" + # Stream tokens from the decode service once the prefill stage has + # materialized KV caches on the target workers. 
+ logger.info("[decode] start request_id=%s url=%s", request_id, url) + try: + async with ( + aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session, + session.post(url=url, json=payload, headers=headers) as resp, + ): + if resp.status != 200: + error_text = await resp.text() + logger.error( + "Decode backend error %s - %s", resp.status, error_text + ) + err_msg = ( + '{"error": "Decode backend error ' + str(resp.status) + '"}' + ) + yield err_msg.encode() + return + logger.info( + "[decode] streaming response request_id=%s status=%s", + request_id, + resp.status, + ) + async for chunk_bytes in resp.content.iter_chunked(1024): + yield chunk_bytes + logger.info("[decode] finished streaming request_id=%s", request_id) + except asyncio.TimeoutError: + logger.error("Decode service timeout at %s", url) + yield b'{"error": "Decode service timeout"}' + except aiohttp.ClientError as exc: + logger.error("Decode service error at %s: %s", url, exc) + yield b'{"error": "Decode service unavailable"}' async def process_request(): """Process a single request through prefill and decode stages""" @@ -146,13 +206,27 @@ def main(): # Create prefill request (max_tokens=1) prefill_request = original_request_data.copy() prefill_request["max_tokens"] = 1 + if "max_completion_tokens" in prefill_request: + prefill_request["max_completion_tokens"] = 1 # Execute prefill stage - async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request): - continue + # The request id encodes both KV socket addresses so the backend can + # shuttle tensors directly via NCCL once the prefill response + # completes. 
+ request_id = ( + f"___prefill_addr_{PREFILL_KV_ADDR}___decode_addr_" + f"{DECODE_KV_ADDR}_{uuid.uuid4().hex}" + ) + + headers = _build_headers(request_id) + await _run_prefill(request.path, prefill_request, headers, request_id) # Execute decode stage and stream response - generator = forward_request(DECODE_SERVICE_URL, original_request_data) + # Pass the unmodified user request so the decode phase can continue + # sampling with the already-populated KV cache. + generator = _stream_decode( + request.path, original_request_data, headers, request_id + ) response = await make_response(generator) response.timeout = None # Disable timeout for streaming response return response @@ -168,23 +242,10 @@ def main(): @app.route("/v1/completions", methods=["POST"]) async def handle_request(): """Handle incoming API requests with concurrency and rate limiting""" - # Create task for request processing - task = asyncio.create_task(process_request()) - - # Enqueue request or reject if queue is full - if not await request_queue.enqueue(task): - return Response( - response=b'{"error": "Server busy, try again later"}', - status=503, - content_type="application/json", - ) - try: - # Return the response from the processing task - return await task + return await process_request() except asyncio.CancelledError: - # Handle task cancellation (timeout or queue full) - logger.warning("Request cancelled due to timeout or queue full") + logger.warning("Request cancelled") return Response( response=b'{"error": "Request cancelled"}', status=503, diff --git a/examples/online_serving/disaggregated_prefill.sh b/examples/online_serving/disaggregated_prefill.sh index d434e22b1ae88..cd2f2e44a4d69 100644 --- a/examples/online_serving/disaggregated_prefill.sh +++ b/examples/online_serving/disaggregated_prefill.sh @@ -24,7 +24,14 @@ cleanup() { exit 0 } -export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + +if [[ -z "${VLLM_HOST_IP:-}" ]]; then + export VLLM_HOST_IP=127.0.0.1 + echo "Using default 
VLLM_HOST_IP=127.0.0.1 (override by exporting VLLM_HOST_IP before running this script)" +else + echo "Using provided VLLM_HOST_IP=${VLLM_HOST_IP}" +fi + # install quart first -- required for disagg prefill proxy serve if python3 -c "import quart" &> /dev/null; then @@ -38,7 +45,7 @@ fi wait_for_server() { local port=$1 timeout 1200 bash -c " - until curl -s localhost:${port}/v1/completions > /dev/null; do + until curl -i localhost:${port}/v1/models > /dev/null; do sleep 1 done" && return 0 || return 1 } @@ -48,21 +55,23 @@ wait_for_server() { # prefilling instance, which is the KV producer CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \ + --host 0.0.0.0 \ --port 8100 \ --max-model-len 100 \ --gpu-memory-utilization 0.8 \ --trust-remote-code \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":"1e9","kv_port":"14579","kv_connector_extra_config":{"proxy_ip":"'"$VLLM_HOST_IP"'","proxy_port":"30001","http_ip":"'"$VLLM_HOST_IP"'","http_port":"8100","send_type":"PUT_ASYNC"}}' & -# decoding instance, which is the KV consumer +# decoding instance, which is the KV consumer CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \ + --host 0.0.0.0 \ --port 8200 \ --max-model-len 100 \ --gpu-memory-utilization 0.8 \ --trust-remote-code \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":"1e10","kv_port":"14580","kv_connector_extra_config":{"proxy_ip":"'"$VLLM_HOST_IP"'","proxy_port":"30001","http_ip":"'"$VLLM_HOST_IP"'","http_port":"8200","send_type":"PUT_ASYNC"}}' & # wait until prefill and decode instances are ready wait_for_server 8100 From 647464719b131963dccdc3a28cfe52d1af293cda Mon Sep 17 00:00:00 2001 From: Or Ozeri 
Date: Thu, 20 Nov 2025 20:09:59 +0200 Subject: [PATCH 078/249] [KVConnector][Core] Support cross-layer KV blocks (#27743) Signed-off-by: Or Ozeri --- .../unit/test_offloading_connector.py | 8 +- tests/v1/kv_offload/test_cpu_offloading.py | 145 +++++++++------ tests/v1/worker/test_gpu_model_runner.py | 5 +- vllm/attention/backends/abstract.py | 29 ++- .../kv_transfer/kv_connector/v1/base.py | 33 +++- .../kv_connector/v1/offloading_connector.py | 43 ++++- vllm/v1/attention/backends/flash_attn.py | 12 +- vllm/v1/attention/backends/flashinfer.py | 12 +- vllm/v1/attention/backends/mla/common.py | 9 + vllm/v1/attention/backends/mla/indexer.py | 6 +- vllm/v1/kv_offload/cpu.py | 17 +- vllm/v1/kv_offload/spec.py | 6 +- vllm/v1/kv_offload/worker/cpu_gpu.py | 12 +- vllm/v1/worker/gpu_model_runner.py | 41 ++++- .../worker/kv_connector_model_runner_mixin.py | 165 ++++++++++++++++++ 15 files changed, 453 insertions(+), 90 deletions(-) diff --git a/tests/v1/kv_connector/unit/test_offloading_connector.py b/tests/v1/kv_connector/unit/test_offloading_connector.py index 23b6c4802d106..69565f584ab89 100644 --- a/tests/v1/kv_connector/unit/test_offloading_connector.py +++ b/tests/v1/kv_connector/unit/test_offloading_connector.py @@ -19,6 +19,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.offloading_connector import ( ) from vllm.forward_context import ForwardContext from vllm.utils.hashing import sha256 +from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend from vllm.v1.core.kv_cache_utils import ( BlockHash, get_request_block_hasher, @@ -92,7 +93,7 @@ class MockOffloadingSpec(OffloadingSpec): return self.manager def get_handlers( - self, _ + self, _, __ ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]: yield GPULoadStoreSpec, MockLoadStoreSpec, self.handler yield MockLoadStoreSpec, GPULoadStoreSpec, self.handler @@ -138,7 +139,10 @@ class RequestRunner: self.worker_connector = OffloadingConnector(vllm_config, 
KVConnectorRole.WORKER) # register worker kv_caches to enable OffloadingWorker creations - self.worker_connector.register_kv_caches(kv_caches={"a": torch.empty(0)}) + self.worker_connector.register_cross_layers_kv_cache( + kv_cache=torch.empty(0), + attn_backend=FlashAttentionBackend, + ) # extract connector of scheduler scheduler_connector = self.scheduler.connector diff --git a/tests/v1/kv_offload/test_cpu_offloading.py b/tests/v1/kv_offload/test_cpu_offloading.py index b654ea4298dbb..3ee41c40859dc 100644 --- a/tests/v1/kv_offload/test_cpu_offloading.py +++ b/tests/v1/kv_offload/test_cpu_offloading.py @@ -12,8 +12,10 @@ from tqdm import tqdm from vllm import LLM, SamplingParams, TokensPrompt from vllm.config import KVEventsConfig, KVTransferConfig from vllm.distributed.kv_events import BlockStored, KVEventBatch +from vllm.utils.system_utils import set_env_var -CPU_BLOCK_SIZES = [16, 48] +CPU_BLOCK_SIZES = [48] +ATTN_BACKENDS = ["FLASH_ATTN", "FLASHINFER"] class MockSubscriber: @@ -63,8 +65,88 @@ class MockSubscriber: self.sub.close() +def _latency_test(llm: LLM, subscriber: MockSubscriber): + sampling_params = SamplingParams(max_tokens=1) + + num_times_cpu_better_than_cold = 0 + num_tests = 10 + total_cold_time = 0.0 + total_gpu_hit_time = 0.0 + total_cpu_hit_time = 0.0 + prompt_token_ids = [0] * 10001 + for i in tqdm(range(num_tests), desc="Running tests"): + prompt_token_ids[0] = i + prompts = [TokensPrompt(prompt_token_ids=prompt_token_ids)] + + # run generation - this should trigger saving KV cache + start_time = time.time() + llm.generate(prompts, sampling_params, use_tqdm=False) + cold_time = time.time() - start_time + total_cold_time += cold_time + + # run generation again - should hit the GPU prefix cache + start_time = time.time() + llm.generate(prompts, sampling_params, use_tqdm=False) + gpu_hit_time = time.time() - start_time + total_gpu_hit_time += gpu_hit_time + + # reset prefix cache to avoid GPU hit. 
+ llm.reset_prefix_cache() + + assert subscriber.get_new_cpu_stored_events() + + # run generation again - this should trigger loading from CPU + start_time = time.time() + llm.generate(prompts, sampling_params, use_tqdm=False) + cpu_hit_time = time.time() - start_time + total_cpu_hit_time += cpu_hit_time + + if cpu_hit_time < cold_time: + num_times_cpu_better_than_cold += 1 + + print("Average times:") + print(f" Cold: {total_cold_time * 1000 / num_tests:.2f}ms") + print(f" GPU hit: {total_gpu_hit_time * 1000 / num_tests:.2f}ms") + print(f" CPU hit: {total_cpu_hit_time * 1000 / num_tests:.2f}ms") + + assert num_times_cpu_better_than_cold >= 0.8 * num_tests + + +def _accuracy_test(llm: LLM, subscriber: MockSubscriber): + sampling_params = SamplingParams(max_tokens=1) + cpu_block_size = ( + llm.llm_engine.vllm_config.kv_transfer_config.kv_connector_extra_config[ + "block_size" + ] + ) + + subscriber.get_new_cpu_stored_events() + + # prepend prompt to be cpu block aligned + prompt = "Let's count to 10. One, two, three, four," + while ( + len(llm.generate(prompt, use_tqdm=False)[0].prompt_token_ids) % cpu_block_size + != 0 + ): + prompt = ". " + prompt + + assert subscriber.get_new_cpu_stored_events() + + test_count = 100 + success_count = 0 + for i in range(test_count): + if ( + llm.generate(prompt, sampling_params, use_tqdm=False)[0].outputs[0].text + == " five" + ): + success_count += 1 + + assert success_count >= 0.5 * test_count + + @pytest.mark.parametrize("cpu_block_size", CPU_BLOCK_SIZES) -def test_cpu_offloading(cpu_block_size: int) -> None: +@pytest.mark.parametrize("attn_backend", ATTN_BACKENDS) +def test_cpu_offloading(cpu_block_size: int, attn_backend: str) -> None: """ Tests OffloadingConnector with CPUOffloadingSpec. 
""" @@ -92,61 +174,20 @@ def test_cpu_offloading(cpu_block_size: int) -> None: topic="test", ) - llm = LLM( - model="meta-llama/Llama-3.2-1B-Instruct", - gpu_memory_utilization=0.5, - kv_events_config=kv_events_config, - kv_transfer_config=kv_transfer_config, - ) - - sampling_params = SamplingParams(temperature=0, max_tokens=1) + with set_env_var("VLLM_ATTENTION_BACKEND", attn_backend): + llm = LLM( + model="meta-llama/Llama-3.2-1B-Instruct", + gpu_memory_utilization=0.5, + kv_events_config=kv_events_config, + kv_transfer_config=kv_transfer_config, + ) events_endpoint = events_endpoint.replace("*", "127.0.0.1") subscriber = MockSubscriber(events_endpoint, topic=kv_events_config.topic) try: - num_times_cpu_better_than_cold = 0 - num_tests = 10 - total_cold_time = 0.0 - total_gpu_hit_time = 0.0 - total_cpu_hit_time = 0.0 - prompt_token_ids = [0] * 10001 - for i in tqdm(range(num_tests), desc="Running tests"): - prompt_token_ids[0] = i - prompts = [TokensPrompt(prompt_token_ids=prompt_token_ids)] - - # run generation - this should trigger saving KV cache - start_time = time.time() - llm.generate(prompts, sampling_params, use_tqdm=False) - cold_time = time.time() - start_time - total_cold_time += cold_time - - # run generation again - should hit the GPU prefix cache - start_time = time.time() - llm.generate(prompts, sampling_params, use_tqdm=False) - gpu_hit_time = time.time() - start_time - total_gpu_hit_time += gpu_hit_time - - # reset prefix cache to avoid GPU hit. 
- llm.reset_prefix_cache() - - assert subscriber.get_new_cpu_stored_events() - - # run generation again - this should trigger loading from CPU - start_time = time.time() - llm.generate(prompts, sampling_params, use_tqdm=False) - cpu_hit_time = time.time() - start_time - total_cpu_hit_time += cpu_hit_time - - if cpu_hit_time < cold_time: - num_times_cpu_better_than_cold += 1 - - print("Average times:") - print(f" Cold: {total_cold_time * 1000 / num_tests:.2f}ms") - print(f" GPU hit: {total_gpu_hit_time * 1000 / num_tests:.2f}ms") - print(f" CPU hit: {total_cpu_hit_time * 1000 / num_tests:.2f}ms") - - assert num_times_cpu_better_than_cold >= 0.8 * num_tests + _latency_test(llm, subscriber) + _accuracy_test(llm, subscriber) finally: subscriber.close() del llm diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 824e458978350..01c1364f7ee62 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -483,7 +483,10 @@ def test_kv_cache_stride_order(monkeypatch, model_runner): # Permutation that gets you back to expected kv shape for test_stride in ((1, 4, 0, 2, 3), (0, 1, 2, 3, 4)): - def rnd_stride_order(test_stride=test_stride): + def rnd_stride_order( + include_num_layers_dimension: bool = False, test_stride=test_stride + ): + assert not include_num_layers_dimension return test_stride # Patch the attention backend class and re-trigger the KV cache creation diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 188becb6ad6f0..67ded88475243 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -76,7 +76,34 @@ class AttentionBackend(ABC): raise NotImplementedError @staticmethod - def get_kv_cache_stride_order() -> tuple[int, ...]: + def get_kv_cache_stride_order( + include_num_layers_dimension: bool = False, + ) -> tuple[int, ...]: + """ + Get the physical (memory layout) ordering of the kv cache 
dimensions. + e.g. if the KV cache shape is + [2, num_blocks, block_size, num_heads, head_size], + and get_kv_cache_stride_order returns (1, 3, 0, 2, 4) then the physical + ordering of dimensions is + [num_blocks, num_heads, 2, block_size, head_size]. + + If this function is unimplemented / raises NotImplementedError, + the physical layout of the KV cache will match the logical shape. + + Args: + include_num_layers_dimension: if True, includes an additional + num_layers dimension, which is assumed to be prepended + to the logical KV cache shape. + With the above example, a return value (2, 4, 0, 1, 3, 5) + corresponds to + [num_blocks, num_heads, num_layers, 2, block_size, head_size]. + + If an additional dimension is NOT included in the returned + tuple, the physical layout will not include a layers dimension. + + Returns: + A tuple of ints which is a permutation of range(len(shape)). + """ raise NotImplementedError @classmethod diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index f85eb414b2222..74f09278b7bb1 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -38,7 +38,7 @@ The class provides the following primitives: import enum from abc import ABC, abstractmethod from collections.abc import Callable, Iterable -from typing import TYPE_CHECKING, Any, Literal, Optional +from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional import torch @@ -47,7 +47,7 @@ from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.outputs import KVConnectorOutput if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionMetadata + from vllm.attention.backends.abstract import AttentionBackend, AttentionMetadata from vllm.config import VllmConfig from vllm.distributed.kv_events import KVCacheEvent from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( @@ -142,6 +142,18 @@ class 
KVConnectorMetadata(ABC): # noqa: B024 class KVConnectorBase_V1(ABC): + """ + Base class for KV connectors. + + Attributes: + prefer_cross_layer_blocks (bool): Indicates whether this connector + prefers KV blocks that hold KV data for all layers (for speeding + up KV data transfers). + Defaults to False. + """ + + prefer_cross_layer_blocks: ClassVar[bool] = False + def __init__( self, vllm_config: "VllmConfig", @@ -226,6 +238,23 @@ class KVConnectorBase_V1(ABC): """ return + def register_cross_layers_kv_cache( + self, kv_cache: torch.Tensor, attn_backend: type["AttentionBackend"] + ): + """ + Initialize with a single KV cache tensor used by all layers. + The first dimension should be num_layers. + This function will only be called for models with uniform layers, + and only if the prefers_cross_layer_blocks is set to True. + Only one of the functions + {register_kv_caches, register_cross_layers_kv_cache} will be called. + + Args: + kv_cache: a cross-layers kv cache tensor + attn_backend: The attention backend that corresponds to all layers + """ + return + def set_host_xfer_buffer_ops(self, copy_operation: CopyBlocksOp): """ Set the xPU-specific ops for copying KV between host and device. 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py index 582e42cc466ae..8cd09014cab11 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py @@ -4,12 +4,12 @@ from collections import defaultdict from collections.abc import Iterable, Iterator from dataclasses import dataclass from itertools import islice -from typing import Any +from typing import Any, ClassVar import torch -from vllm.attention import AttentionMetadata -from vllm.config import VllmConfig +from vllm.attention import Attention, AttentionBackend, AttentionMetadata +from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.distributed.kv_events import BlockRemoved, BlockStored, KVCacheEvent from vllm.distributed.kv_transfer.kv_connector.v1 import ( KVConnectorBase_V1, @@ -42,6 +42,8 @@ class OffloadingConnectorMetadata(KVConnectorMetadata): class OffloadingConnector(KVConnectorBase_V1): + prefer_cross_layer_blocks: ClassVar[bool] = True + def __init__( self, vllm_config: VllmConfig, @@ -63,6 +65,12 @@ class OffloadingConnector(KVConnectorBase_V1): assert self.connector_worker is not None self.connector_worker.register_kv_caches(kv_caches) + def register_cross_layers_kv_cache( + self, kv_cache: torch.Tensor, attn_backend: type[AttentionBackend] + ): + assert self.connector_worker is not None + self.connector_worker.register_cross_layers_kv_cache(kv_cache, attn_backend) + def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None: assert self.connector_worker is not None assert isinstance(self._connector_metadata, OffloadingConnectorMetadata) @@ -422,10 +430,35 @@ class OffloadingConnectorWorker: self._job_counter = job_id + 1 return job_id - def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): - for src_cls, dst_cls, handler in 
self.spec.get_handlers(kv_caches): + def _register_handlers( + self, + kv_caches: dict[str, torch.Tensor], + attn_backends: dict[str, type[AttentionBackend]], + ): + for src_cls, dst_cls, handler in self.spec.get_handlers( + kv_caches, attn_backends + ): self.worker.register_handler(src_cls, dst_cls, handler) + def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): + layer_names = list(kv_caches.keys()) + layers = get_layers_from_vllm_config( + self.spec.vllm_config, Attention, layer_names + ) + attn_backends = { + layer_name: layers[layer_name].get_attn_backend() + for layer_name in layer_names + } + self._register_handlers(kv_caches, attn_backends) + + def register_cross_layers_kv_cache( + self, kv_cache: torch.Tensor, attn_backend: type[AttentionBackend] + ): + cross_layer_name = "ALL_LAYERS" + kv_caches = {cross_layer_name: kv_cache} + attn_backends = {cross_layer_name: attn_backend} + self._register_handlers(kv_caches, attn_backends) + def start_load_kv(self, metadata: OffloadingConnectorMetadata): for req_id, transfer_spec in metadata.reqs_to_load.items(): job_id = self._generate_job_id() diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index cf3c1d05f5b3f..9fa6b1dfd19dd 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -99,12 +99,20 @@ class FlashAttentionBackend(AttentionBackend): return (2, num_blocks, block_size, num_kv_heads, head_size) @staticmethod - def get_kv_cache_stride_order() -> tuple[int, ...]: + def get_kv_cache_stride_order( + include_num_layers_dimension: bool = False, + ) -> tuple[int, ...]: # `stride_order` indicates the permutation that gets # us from `get_kv_cache_shape` to the actual memory layout we want. 
cache_layout = get_kv_cache_layout() - if cache_layout == "NHD": + if cache_layout == "NHD" and include_num_layers_dimension: + # (num_blocks, num_layers, 2, block_size, num_kv_heads, head_size) + return (2, 0, 1, 3, 4, 5) + elif cache_layout == "NHD": stride_order = (0, 1, 2, 3, 4) + elif cache_layout == "HND" and include_num_layers_dimension: + # (num_blocks, num_kv_heads, num_layers, 2, block_size, head_size) + return (2, 4, 0, 1, 3, 5) elif cache_layout == "HND": stride_order = (0, 1, 3, 2, 4) else: diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 4da1637d96eb6..3ad7e8c52fc1f 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -309,12 +309,20 @@ class FlashInferBackend(AttentionBackend): return (num_blocks, 2, block_size, num_kv_heads, head_size) @staticmethod - def get_kv_cache_stride_order() -> tuple[int, ...]: + def get_kv_cache_stride_order( + include_num_layers_dimension: bool = False, + ) -> tuple[int, ...]: # `stride_order` indicates the permutation that gets us from # `get_kv_cache_shape` to the actual memory layout we want. 
cache_layout = get_kv_cache_layout() - if cache_layout == "NHD": + if cache_layout == "NHD" and include_num_layers_dimension: + # (num_blocks, num_layers, 2, block_size, num_kv_heads, head_size) + return (1, 0, 2, 3, 4, 5) + elif cache_layout == "NHD": stride_order = (0, 1, 2, 3, 4) + elif cache_layout == "HND" and include_num_layers_dimension: + # (num_blocks, 2, num_kv_heads, num_layers, block_size, head_size) + return (1, 2, 4, 0, 3, 5) elif cache_layout == "HND": stride_order = (0, 1, 3, 2, 4) else: diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 32f406980f2ed..43aef8a7cca91 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -308,6 +308,15 @@ class MLACommonBackend(AttentionBackend): ) -> tuple[int, ...]: return (num_blocks, block_size, head_size) + @staticmethod + def get_kv_cache_stride_order( + include_num_layers_dimension: bool = False, + ) -> tuple[int, ...]: + # `stride_order` indicates the permutation that gets + # us from `get_kv_cache_shape` to the actual memory layout we want. 
+ # (num_blocks, num_layers, block_size, head_size) + return (1, 0, 2, 3) if include_num_layers_dimension else (0, 1, 2) + @classmethod def get_supported_head_sizes(cls) -> list[int]: return [576] diff --git a/vllm/v1/attention/backends/mla/indexer.py b/vllm/v1/attention/backends/mla/indexer.py index cc0988435768c..d38361e0fcbf8 100644 --- a/vllm/v1/attention/backends/mla/indexer.py +++ b/vllm/v1/attention/backends/mla/indexer.py @@ -48,7 +48,11 @@ class DeepseekV32IndexerBackend(AttentionBackend): return (num_blocks, block_size, head_size) @staticmethod - def get_kv_cache_stride_order() -> tuple[int, ...]: + def get_kv_cache_stride_order( + include_num_layers_dimension: bool = False, + ) -> tuple[int, ...]: + if include_num_layers_dimension: + return (0, 1, 2, 3) return (0, 1, 2) diff --git a/vllm/v1/kv_offload/cpu.py b/vllm/v1/kv_offload/cpu.py index 4b1bbe6f0cc2a..86747299eb107 100644 --- a/vllm/v1/kv_offload/cpu.py +++ b/vllm/v1/kv_offload/cpu.py @@ -4,8 +4,8 @@ from collections.abc import Iterator import torch -from vllm.config import VllmConfig, get_layers_from_vllm_config -from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase +from vllm.attention import AttentionBackend +from vllm.config import VllmConfig from vllm.platforms import current_platform from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager from vllm.v1.kv_offload.arc_manager import ARCOffloadingManager @@ -63,7 +63,9 @@ class CPUOffloadingSpec(OffloadingSpec): return self._manager def get_handlers( - self, kv_caches: dict[str, torch.Tensor] + self, + kv_caches: dict[str, torch.Tensor], + attn_backends: dict[str, type[AttentionBackend]], ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]: if not self._handler: if not current_platform.is_cuda_alike(): @@ -71,15 +73,6 @@ class CPUOffloadingSpec(OffloadingSpec): "CPU Offloading is currently only supported on CUDA-alike GPUs" ) - layer_names = list(kv_caches.keys()) - layers = 
get_layers_from_vllm_config( - self.vllm_config, AttentionLayerBase, layer_names - ) - attn_backends = { - layer_name: layers[layer_name].get_attn_backend() - for layer_name in layer_names - } - self._handler = CpuGpuOffloadingHandler( attn_backends=attn_backends, gpu_block_size=self.gpu_block_size, diff --git a/vllm/v1/kv_offload/spec.py b/vllm/v1/kv_offload/spec.py index a3c539a47d458..c1813a4ff4ea9 100644 --- a/vllm/v1/kv_offload/spec.py +++ b/vllm/v1/kv_offload/spec.py @@ -11,6 +11,7 @@ from vllm.v1.kv_offload.abstract import LoadStoreSpec, OffloadingManager from vllm.v1.kv_offload.worker.worker import OffloadingHandler if TYPE_CHECKING: + from vllm.attention import AttentionBackend from vllm.config import VllmConfig logger = init_logger(__name__) @@ -48,13 +49,16 @@ class OffloadingSpec(ABC): @abstractmethod def get_handlers( - self, kv_caches: dict[str, torch.Tensor] + self, + kv_caches: dict[str, torch.Tensor], + attn_backends: dict[str, type["AttentionBackend"]], ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]: """ Get offloading handlers along with their respective src and dst types. Args: kv_caches: A dictionary of layer_name -> gpu_kv_cache tensor. + attn_backends: A dictionary of layer_name -> AttentionBackend. Yields: Tuples of (src_type, dst_type, offloading_handler). 
diff --git a/vllm/v1/kv_offload/worker/cpu_gpu.py b/vllm/v1/kv_offload/worker/cpu_gpu.py index 111046377a5da..bb163f0043fc6 100644 --- a/vllm/v1/kv_offload/worker/cpu_gpu.py +++ b/vllm/v1/kv_offload/worker/cpu_gpu.py @@ -83,10 +83,18 @@ class CpuGpuOffloadingHandler(OffloadingHandler): self.gpu_tensors.append(gpu_tensor) gpu_shape = gpu_tensor.shape - test_shape = attn_backends[layer_name].get_kv_cache_shape( + attn_backend = attn_backends[layer_name] + test_shape = attn_backend.get_kv_cache_shape( num_blocks=1234, block_size=16, num_kv_heads=8, head_size=256 ) - if test_shape[0] == 1234: + + if len(gpu_shape) != len(test_shape): + # cross-layers tensor + # shape is (num_blocks, ...) + assert len(gpu_shape) == len(test_shape) + 1 + num_blocks_idx = 0 + self.kv_dim_before_num_blocks.append(False) + elif test_shape[0] == 1234: # shape is (num_blocks, ...) num_blocks_idx = 0 self.kv_dim_before_num_blocks.append(False) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0490ed39c8c78..4b0a08ab57e16 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -349,6 +349,9 @@ class GPUModelRunner( # self.model: nn.Module # Set after load_model # Initialize in initialize_kv_cache self.kv_caches: list[torch.Tensor] = [] + # Initialize in initialize_kv_cache_tensors + self.cross_layers_kv_cache: torch.Tensor | None = None + self.cross_layers_attn_backend: type[AttentionBackend] | None = None # indexes: [kv_cache_group_id][attn_group] self.attn_groups: list[list[AttentionGroup]] = [] # self.kv_cache_config: KVCacheConfig @@ -4930,12 +4933,30 @@ class GPUModelRunner( Dict[str, torch.Tensor]: A map between layer names to their corresponding memory buffer for KV cache. 
""" - # Initialize the memory buffer for KV cache - kv_cache_raw_tensors = self._allocate_kv_cache_tensors(kv_cache_config) - # Change the memory buffer to the desired shape - kv_caches = self._reshape_kv_cache_tensors( - kv_cache_config, kv_cache_raw_tensors, kernel_block_sizes - ) + + # Try creating KV caches optimized for kv-connector transfers + cache_dtype = self.cache_config.cache_dtype + if self.use_uniform_kv_cache(self.attn_groups, cache_dtype): + kv_caches, cross_layers_kv_cache, attn_backend = ( + self.allocate_uniform_kv_caches( + kv_cache_config, + self.attn_groups, + cache_dtype, + self.device, + kernel_block_sizes, + ) + ) + self.cross_layers_kv_cache = cross_layers_kv_cache + self.cross_layers_attn_backend = attn_backend + else: + # Fallback to the general case + # Initialize the memory buffer for KV cache + kv_cache_raw_tensors = self._allocate_kv_cache_tensors(kv_cache_config) + + # Change the memory buffer to the desired shape + kv_caches = self._reshape_kv_cache_tensors( + kv_cache_config, kv_cache_raw_tensors, kernel_block_sizes + ) # Set up cross-layer KV cache sharing for layer_name, target_layer_name in self.shared_kv_cache_layers.items(): @@ -5017,7 +5038,13 @@ class GPUModelRunner( if has_kv_transfer_group(): kv_transfer_group = get_kv_transfer_group() - kv_transfer_group.register_kv_caches(kv_caches) + if self.cross_layers_kv_cache is not None: + assert self.cross_layers_attn_backend is not None + kv_transfer_group.register_cross_layers_kv_cache( + self.cross_layers_kv_cache, self.cross_layers_attn_backend + ) + else: + kv_transfer_group.register_kv_caches(kv_caches) kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks) if self.dcp_world_size > 1: diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py index db037a9fccd5c..e59361f21372a 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py +++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -11,7 +11,11 @@ 
from typing import ( TYPE_CHECKING, # noqa: UP035 ) +import torch + +from vllm.attention import AttentionBackend from vllm.config import VllmConfig +from vllm.config.cache import CacheDType from vllm.distributed.kv_transfer import ( ensure_kv_transfer_shutdown, get_kv_transfer_group, @@ -21,11 +25,13 @@ from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats from vllm.forward_context import get_forward_context, set_forward_context from vllm.logger import init_logger +from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig from vllm.v1.outputs import ( EMPTY_MODEL_RUNNER_OUTPUT, KVConnectorOutput, ModelRunnerOutput, ) +from vllm.v1.worker.utils import AttentionGroup if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput @@ -142,3 +148,162 @@ class KVConnectorModelRunnerMixin: if has_kv_transfer_group(): return get_kv_transfer_group().get_kv_connector_stats() return None + + @staticmethod + def use_uniform_kv_cache( + attn_groups: list[list[AttentionGroup]], + cache_dtype: CacheDType, + ) -> bool: + """ + Determines whether a uniform KV layout should be used. + A uniform layout means all layers KV caches will share the same + underlying tensor, where for a given block number, the respective + KV data for all layers will be contiguous. + This will allow efficient KV transfer of per-block KV data for all + layers at once. + Note this layout will only be applied given 3 conditions: + 1. The KV Cache config contains just a single group where all layers + have the same page size. + 2. A KV connector is configured, and the KV connector instance prefers + to use this layout (prefer_cross_layer_blocks() returns True) + 3. 
The flash attention backend supports this layout + (get_kv_cache_stride_order(True) includes a placement for a + num_layers dimension) + + Note that the actual placement of the num_layers dimensions + in the unified layers tensors will be determined by the attention + backend. + Thus, the layers KV data may still not be contiguous per block + if the attention backend does not support it. + + Args: + attn_groups: The list of attention groups for this model + cache_dtype: The KV cache dtype + Returns: + True if we should use a uniform KV cache layout. + """ + + if not has_kv_transfer_group(): + return False + if not get_kv_transfer_group().prefer_cross_layer_blocks: + return False + + if len(attn_groups) != 1 or len(attn_groups[0]) != 1: + return False + + attn_group = attn_groups[0][0] + kv_cache_spec = attn_group.kv_cache_spec + if not isinstance(kv_cache_spec, AttentionSpec): + return False + + attn_backend = attn_group.backend + kv_cache_shape = attn_backend.get_kv_cache_shape( + 1234, + kv_cache_spec.block_size, + kv_cache_spec.num_kv_heads, + kv_cache_spec.head_size, + cache_dtype_str=cache_dtype, + ) + + try: + kv_cache_stride_order = attn_backend.get_kv_cache_stride_order( + include_num_layers_dimension=True + ) + except (AttributeError, NotImplementedError): + return False + + # check that attention backend include a layers dimension + return len(kv_cache_stride_order) == len(kv_cache_shape) + 1 + + @staticmethod + def allocate_uniform_kv_caches( + kv_cache_config: KVCacheConfig, + attn_groups: list[list[AttentionGroup]], + cache_dtype: CacheDType, + device: torch.device, + kernel_block_sizes: list[int], + ) -> tuple[dict[str, torch.Tensor], torch.Tensor, type[AttentionBackend]]: + """ + Initializes and reshapes KV caches for the simple case where all + layers have the same layout. + + This function assumes use_uniform_kv_cache() returned True. 
+ + Args: + kv_cache_config: The KV cache config + attn_groups: The list of attention groups for this model + cache_dtype: The KV cache dtype + device: The torch device to allocate on. + kernel_block_sizes: The kernel block sizes for each KV cache group. + Returns: + A tuple (kv_caches, cross_layers_kv_cache, attn_backend) where: + kv_caches is a dict mapping between layer names to their + corresponding memory buffer for KV cache. + cross_layers_kv_cache is the cross layers kv cache tensor + attn_backend is the attention backend matching this tensor + """ + attn_group = attn_groups[0][0] + kv_cache_spec = attn_group.kv_cache_spec + assert isinstance(kv_cache_spec, AttentionSpec) + + tensor_sizes = set( + kv_cache_tensor.size for kv_cache_tensor in kv_cache_config.kv_cache_tensors + ) + assert len(tensor_sizes) == 1 + tensor_size = tensor_sizes.pop() + + page_size = kv_cache_spec.page_size_bytes + assert tensor_size % page_size == 0 + num_blocks = tensor_size // page_size + num_layers = len(kv_cache_config.kv_cache_tensors) + total_size = tensor_size * num_layers + + assert len(kernel_block_sizes) == 1 + kernel_block_size = kernel_block_sizes[0] + num_blocks_per_kv_block = kv_cache_spec.block_size // kernel_block_size + kernel_num_blocks = num_blocks * num_blocks_per_kv_block + + attn_backend = attn_group.backend + kv_cache_shape = attn_backend.get_kv_cache_shape( + kernel_num_blocks, + kernel_block_size, + kv_cache_spec.num_kv_heads, + kv_cache_spec.head_size, + cache_dtype_str=cache_dtype, + ) + + # prepend a num_layers dimension into the shape + kv_cache_shape = (num_layers,) + kv_cache_shape + + try: + kv_cache_stride_order = attn_backend.get_kv_cache_stride_order( + include_num_layers_dimension=True + ) + assert len(kv_cache_stride_order) == len(kv_cache_shape) + except (AttributeError, NotImplementedError): + kv_cache_stride_order = tuple(range(len(kv_cache_shape))) + + kv_cache_shape = tuple(kv_cache_shape[i] for i in kv_cache_stride_order) + + 
logger.info("Allocating a cross layer KV cache of shape %s", kv_cache_shape) + + # allocate one contiguous buffer for all layers + cross_layers_kv_cache = ( + torch.zeros(total_size, dtype=torch.int8, device=device) + .view(kv_cache_spec.dtype) + .view(kv_cache_shape) + ) + + # Maintain original KV shape view. + inv_order = [ + kv_cache_stride_order.index(i) for i in range(len(kv_cache_stride_order)) + ] + permuted_kv_cache = cross_layers_kv_cache.permute(*inv_order) + + kv_caches = {} + for i, kv_cache_tensor in enumerate(kv_cache_config.kv_cache_tensors): + tensor = permuted_kv_cache[i] + for layer_name in kv_cache_tensor.shared_by: + kv_caches[layer_name] = tensor + + return kv_caches, cross_layers_kv_cache, attn_backend From 114b0e25004b7e7cf0a23dc65f407471bd5de7e8 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Thu, 20 Nov 2025 10:22:40 -0800 Subject: [PATCH 079/249] [chore] Update annotate release scripts (#29077) Signed-off-by: Kevin H. Luu --- .buildkite/scripts/annotate-release.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh index 56bb5cedaa0a9..df805e0850806 100755 --- a/.buildkite/scripts/annotate-release.sh +++ b/.buildkite/scripts/annotate-release.sh @@ -23,8 +23,8 @@ To download the wheel (by version): aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl . aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl . -aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl . aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl . +aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl . 
\`\`\` To download and upload the image: @@ -45,9 +45,10 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker push vllm/vllm-openai:latest-aarch64 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 -docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend -docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend +docker manifest rm vllm/vllm-openai:latest +docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 +docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 docker manifest push vllm/vllm-openai:latest docker manifest push vllm/vllm-openai:v${RELEASE_VERSION} \`\`\` -EOF \ No newline at end of file +EOF From 4d01b6428448225807e6605d04e37e29fe729b44 Mon Sep 17 00:00:00 2001 From: Software Developer <7852635+dsuhinin@users.noreply.github.com> Date: Thu, 20 Nov 2025 21:00:33 +0100 Subject: [PATCH 080/249] [Bugfix] - Add Trace Headers to Beam Search Path (#29100) Signed-off-by: dsuhinin --- vllm/entrypoints/openai/serving_chat.py | 1 + vllm/entrypoints/openai/serving_completion.py | 1 + vllm/entrypoints/openai/serving_engine.py | 2 ++ 3 files changed, 4 insertions(+) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 59e1c8d531793..6cc685acd6728 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -319,6 +319,7 @@ class OpenAIServingChat(OpenAIServing): request_id=request_id, params=sampling_params, lora_request=lora_request, + trace_headers=trace_headers, ) else: engine_request, tokenization_kwargs = await self._process_inputs( diff --git a/vllm/entrypoints/openai/serving_completion.py 
b/vllm/entrypoints/openai/serving_completion.py index a114b77ebc16b..1cfb45ef40366 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -216,6 +216,7 @@ class OpenAIServingCompletion(OpenAIServing): request_id=request_id, params=sampling_params, lora_request=lora_request, + trace_headers=trace_headers, ) else: engine_request, tokenization_kwargs = await self._process_inputs( diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 127b8e6dcb87c..7dab5dbacd28c 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -343,6 +343,7 @@ class OpenAIServing: request_id: str, params: BeamSearchParams, lora_request: LoRARequest | None = None, + trace_headers: Mapping[str, str] | None = None, ) -> AsyncGenerator[RequestOutput, None]: beam_width = params.beam_width max_tokens = params.max_tokens @@ -437,6 +438,7 @@ class OpenAIServing: beam_search_params, request_id_item, lora_request=lora_req, + trace_headers=trace_headers, ) ) ) From 3d84ef9054af190ce68333be3e4d16fe928be754 Mon Sep 17 00:00:00 2001 From: rasmith Date: Thu, 20 Nov 2025 14:39:49 -0600 Subject: [PATCH 081/249] [CI/Build][AMD] Skip if flash_attn_varlen_func not available in test_aiter_flash_attn.py (#29043) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- tests/kernels/attention/test_aiter_flash_attn.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/kernels/attention/test_aiter_flash_attn.py b/tests/kernels/attention/test_aiter_flash_attn.py index 1dec46e33f22e..8f58c470d217a 100644 --- a/tests/kernels/attention/test_aiter_flash_attn.py +++ b/tests/kernels/attention/test_aiter_flash_attn.py @@ -6,6 +6,7 @@ import pytest import torch import vllm.v1.attention.backends.rocm_aiter_fa # noqa: F401 +from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available from vllm.platforms import current_platform NUM_HEADS = [(4, 
4), (8, 2)] @@ -100,6 +101,8 @@ def test_varlen_with_paged_kv( num_blocks: int, q_dtype: torch.dtype | None, ) -> None: + if not is_flash_attn_varlen_func_available(): + pytest.skip("flash_attn_varlen_func required to run this test.") torch.set_default_device("cuda") current_platform.seed_everything(0) num_seqs = len(seq_lens) From 5e5a7eb16f121f05e19c8bdf88247744ab9d1b83 Mon Sep 17 00:00:00 2001 From: rasmith Date: Thu, 20 Nov 2025 14:45:56 -0600 Subject: [PATCH 082/249] [CI/Build] Make test_attention_selector.py run tests on correct platform (#29064) Signed-off-by: Randall Smith Signed-off-by: rasmith Co-authored-by: Randall Smith Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- tests/kernels/attention/test_attention_selector.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 3b8e939300a27..9be56a33f76c8 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -7,6 +7,7 @@ import pytest import torch from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend +from vllm.platforms import current_platform from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cuda import CudaPlatform from vllm.platforms.rocm import RocmPlatform @@ -47,9 +48,11 @@ DEVICE_MLA_BLOCK_SIZES = { def generate_params(): + is_rocm = current_platform.is_rocm() params = [] + device_list = ["cuda", "cpu"] if not is_rocm else ["hip", "cpu"] for use_mla in [True, False]: - for device in ["cuda", "hip", "cpu"]: + for device in device_list: backends = ( DEVICE_MLA_BACKENDS[device] if use_mla From 3fd74189db13c9793325d9a36539d891873d1ae4 Mon Sep 17 00:00:00 2001 From: Driss Guessous <32754868+drisspg@users.noreply.github.com> Date: Thu, 20 Nov 2025 13:21:54 -0800 Subject: [PATCH 083/249] Fixes bench (#29058) Signed-off-by: 
drisspg --- vllm/compilation/caching.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/compilation/caching.py b/vllm/compilation/caching.py index 16e34c2711e9f..63b7ad7279e37 100644 --- a/vllm/compilation/caching.py +++ b/vllm/compilation/caching.py @@ -12,6 +12,7 @@ from torch.utils import _pytree as pytree import vllm.envs as envs from vllm.config import VllmConfig, get_current_vllm_config +from vllm.config.utils import hash_factors from vllm.logger import init_logger try: @@ -138,7 +139,7 @@ def compilation_config_hash_factors(vllm_config: VllmConfig) -> list[str]: factors = [] # 0. factors come from the env, for example, The values of # VLLM_PP_LAYER_PARTITION will affect the computation graph. - env_hash = envs.compute_hash() + env_hash = hash_factors(envs.compile_factors()) factors.append(env_hash) # 1. factors come from the vllm_config (it mainly summarizes how the From 8237ab8a2bed14bec5cafbec75033c8e1d54d852 Mon Sep 17 00:00:00 2001 From: rasmith Date: Thu, 20 Nov 2025 15:35:14 -0600 Subject: [PATCH 084/249] [CI/Build] Skip lm-format-enforcer tests in test_struct_output_generate.py for now (#29021) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- .../llm/test_struct_output_generate.py | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index a7d769c8542a9..316e152e7395c 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -47,10 +47,34 @@ EAGLE_SPEC_CONFIG = { PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [ ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "auto", None), ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", None), - ("mistralai/Ministral-8B-Instruct-2410", "lm-format-enforcer", "auto", None), + pytest.param( + "mistralai/Ministral-8B-Instruct-2410", + 
"lm-format-enforcer", + "auto", + None, + marks=pytest.mark.skip( + reason=( + "Flaky: lm-format-enforcer intermittently returns" + "incomplete JSON." + "See https://github.com/noamgat/lm-format-enforcer/issues/169" + ) + ), + ), ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "mistral", None), ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", None), - ("Qwen/Qwen2.5-1.5B-Instruct", "lm-format-enforcer", "auto", None), + pytest.param( + "Qwen/Qwen2.5-1.5B-Instruct", + "lm-format-enforcer", + "auto", + None, + marks=pytest.mark.skip( + reason=( + "Flaky: lm-format-enforcer intermittently returns" + "incomplete JSON." + "See https://github.com/noamgat/lm-format-enforcer/issues/169" + ) + ), + ), # FIXME: This tests are flaky on CI thus disabled. Tracking in Issue #24402 # ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto", None), # ("mistralai/Ministral-8B-Instruct-2410", "outlines", "mistral", None), From c7a29d2c8d07ce6188d0c4bb19df6fd1d0e9bc74 Mon Sep 17 00:00:00 2001 From: rasmith Date: Thu, 20 Nov 2025 15:44:37 -0600 Subject: [PATCH 085/249] [CI/Build] Remove skip global cleanup in test_struct_output_generate.py (#29022) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- tests/v1/entrypoints/llm/test_struct_output_generate.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 316e152e7395c..a00600b87eca1 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -121,7 +121,6 @@ def test_guided_decoding_deprecated(): assert sp1.structured_outputs == guided_decoding -@pytest.mark.skip_global_cleanup @pytest.mark.parametrize( "model_name, backend, tokenizer_mode, speculative_config", PARAMS_MODELS_BACKENDS_TOKENIZER_MODE, @@ -626,7 +625,6 @@ Make the response as short as possible. 
) -@pytest.mark.skip_global_cleanup @pytest.mark.parametrize( "model_name, backend, tokenizer_mode, reasoning_parser, speculative_config", # noqa: E501 [ @@ -711,7 +709,6 @@ def test_structured_output_with_reasoning_matrices( jsonschema.validate(instance=output_json, schema=reasoning_schema) -@pytest.mark.skip_global_cleanup @pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE) def test_structured_output_auto_mode( unsupported_json_schema: dict[str, Any], @@ -758,7 +755,6 @@ def test_structured_output_auto_mode( assert isinstance(parsed_json, dict) -@pytest.mark.skip_global_cleanup def test_guidance_no_additional_properties(): llm = LLM( model="Qwen/Qwen2.5-1.5B-Instruct", From dd39f91edb0588e2dd77eb55c758eb1e35907af8 Mon Sep 17 00:00:00 2001 From: Rob Mulla Date: Thu, 20 Nov 2025 19:05:59 -0500 Subject: [PATCH 086/249] [Doc] cleanup TPU documentation and remove outdated examples (#29048) Signed-off-by: Rob Mulla Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/.nav.yml | 6 +- docs/configuration/tpu.md | 111 ------------------ docs/features/README.md | 37 +++--- docs/features/quantization/README.md | 29 +++-- docs/models/hardware_supported_models/tpu.md | 34 ------ .../offline_inference/profiling_tpu/README.md | 70 ----------- .../profiling_tpu/profiling.py | 110 ----------------- examples/offline_inference/tpu.py | 58 --------- 8 files changed, 40 insertions(+), 415 deletions(-) delete mode 100644 docs/configuration/tpu.md delete mode 100644 docs/models/hardware_supported_models/tpu.md delete mode 100644 examples/offline_inference/profiling_tpu/README.md delete mode 100644 examples/offline_inference/profiling_tpu/profiling.py delete mode 100644 examples/offline_inference/tpu.py diff --git a/docs/.nav.yml b/docs/.nav.yml index 3151ea0e2ec22..c8bf00efb2370 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -24,14 +24,16 @@ nav: - 
deployment/integrations - Training: training - Configuration: - - configuration/README.md - configuration/* + - TPU: https://docs.vllm.ai/projects/tpu/en/latest/ - Models: - models/supported_models.md - models/generative_models.md - models/pooling_models.md - models/extensions - - Hardware Supported Models: models/hardware_supported_models + - Hardware Supported Models: + - models/hardware_supported_models/* + - TPU: https://docs.vllm.ai/projects/tpu/en/latest/recommended_models_features/ - Features: features - Developer Guide: - contributing/README.md diff --git a/docs/configuration/tpu.md b/docs/configuration/tpu.md deleted file mode 100644 index 2d24c9c6e2e95..0000000000000 --- a/docs/configuration/tpu.md +++ /dev/null @@ -1,111 +0,0 @@ -# TPU Optimization Tips - -This doc serves as a collection of handy tips for optimizing your vLLM on TPU workload. - -## Get started - -Looking for setup and installation instructions? Find them [here](https://docs.vllm.ai/projects/tpu/en/latest/getting_started/installation/). - -### TPU workload sizing - -When selecting the ideal number of chips for a single serving instance, it's important to account for both the model size and the average request context length. Adequate HBM for the KV cache is essential to ensure a sufficient number of concurrent requests can be processed. - -The following colab [calculator](https://colab.research.google.com/github/ericehanley/rightsize-vllm/blob/main/HBM_Calculator.ipynb) will tell you: - -- KV cache size requirement per token and per request -- TPU/GPU memory consumed by the model weights -- TPU/GPU memory allocated for the KV cache -- Maximum \# of requests you can approximately set (--max-num-seqs) - -This approach serves as a general rule of thumb. - -#### Latency-throughput tradeoff - -As with rightsizing the number of chips for your workload, consider adjusting `--max-num-seqs` to fine-tune the latency-throughput balance. 
Decreasing `--max-num-seqs` and/or increasing the number of chips can help reduce latency. - -`--max-num-seqs` defines the number of concurrent decode slots, effectively limiting the number of requests the server can process tokens for simultaneously. Increasing this value allows the server to pre-allocate more HBM to handle a higher number of concurrent requests, which can maximize overall throughput. However, this often increases the end-to-end (e2e) latency per request. - -Therefore, carefully tuning `--max-num-seqs` is crucial to achieving the desired balance between latency and throughput for your specific workload. - -In a similar way, `--max-num-batch-tokens` can be adjusted down to improve latency, or adjusted up to improve throughput. - -#### Compilation and Caching - -Coming from a GPU background, one of the key differences you'll notice with TPUs is an initial compilation step. TPUs are specialized accelerators (ASICs) that achieve maximum performance by executing pre-compiled, static computation graphs via the XLA compiler. Unlike GPUs, which can handle dynamic input shapes more flexibly, TPUs require a specific compiled graph for each tensor shape (e.g., batch size and sequence length) they process. - -To manage this, vLLM performs a one-time "warmup" process when you first launch the server. During this phase, it pre-compiles the model for various common input shapes and saves these compiled graphs to a cache on disk or remote storage (located at `~/.cache/vllm/xla_cache` by default). This process can range significantly, anywhere from a few minutes to an hour depending on the size of the model and context length used. - -Although the first compilation can take some time, for all subsequent server launches, vLLM can load these graphs directly from the cache, eliminating the compilation time for future runs. - -Use `VLLM_XLA_CACHE_PATH` environment variable to write to shareable storage for future deployed nodes (like when using autoscaling). 
- -#### Reducing compilation time - -This initial compilation time ranges significantly and is impacted by many of the arguments discussed in this optimization doc. Factors that influence the length of time to compile are things like model size and `--max-num-batch-tokens`. Other arguments you can tune are things like `VLLM_TPU_MOST_MODEL_LEN`. - -### Optimize based on your data - -#### max-model-len vs. most-model-len - -![most_model_len](../assets/design/tpu/most_model_len.png) - -If most of your requests are shorter than the maximum model length but you still need to accommodate occasional longer requests, setting a high maximum model length can negatively impact performance. In these cases, you can try introducing most-model-len by specifying the `VLLM_TPU_MOST_MODEL_LEN` environment variable. - -For example, 1% requests are 32k length and 99% requests are 2k length. You can pass 32k into `--max-model-len 32768` and use `VLLM_TPU_MOST_MODEL_LEN=2048`. - -The requests get subdivided into max-model-len and most-model-len categories, for the latter category, you can gain better performance since the server can process more requests at a time. - -#### Padding - -For online serving with latency requirements, consider switching to bucket padding by setting the `VLLM_TPU_BUCKET_PADDING_GAP` environment variable. Because of the layout of the TPU, try using increments of 128 (e.g., 128, 256, etc.) - -The server pads the requests into fixed lengths before sending them to the model to avoid recompilation. To read more about TPU padding, see [here](https://cloud.google.com/tpu/docs/performance-guide#xla-efficiencies). Currently, there are 2 ways to pad the requests: - -1. the default exponential padding (pad to the nearest power of 2) -2. bucket padding (pad to the nearest linearly increasing bucket). - -When using bucket padding, the buckets start from 16, end at max_model_len, and increment by `VLLM_TPU_BUCKET_PADDING_GAP`. 
- -For example, max_model_len=512, padding_gap=64, the buckets will be [16, 32, 64, 128, 192, 256, 320, 384, 448, 512]. - -The fewer tokens you pad, the less unnecessary computation TPU does, the better performance you can get. For example, if num_tokens=300, with exponential padding, you pad to 512, with the bucket_padding above, you pad to 320. - -However, you need to be careful to choose the padding gap. If the gap is too small, it means the number of buckets is large, leading to increased warmup (precompile) time and higher memory to store the compiled graph. Too many compiled graphs may lead to HBM OOM. Conversely, an overly large gap yields no performance improvement compared to the default exponential padding. - -#### Quantization - -If possible, use the precision that matches the chip’s hardware acceleration: - -- v5e has int4/int8 hardware acceleration in the MXU -- v6e has int4/int8 hardware acceleration in the MXU - -Supported quantized formats and features in vLLM on TPU [Jul '25]: - -- INT8 W8A8 -- INT8 W8A16 -- FP8 KV cache -- [WIP] FP8 W8A8 -- [WIP] AWQ -- [WIP] FP4 W4A8 - -#### Parallelization - -Don't set TP to be less than the number of chips on a single-host deployment. - -Although it’s common to do this with GPUs, don't try to fragment 2 or 8 different workloads across 8 chips on a single host. If you need 1 or 4 chips, just create an instance with 1 or 4 chips (these are partial-host machine types). - -### Tune your workloads - -Although we try to have great default configs, we strongly recommend you check out the [vLLM auto-tuner](../../benchmarks/auto_tune/README.md) to optimize your workloads for your use case. - -### Future Topics We'll Cover - -#### Profiling - -The auto-tuner provides a profile of optimized configurations as its final step. However, interpreting this profile can be challenging for new users. We plan to expand this section in the future with more detailed guidance. 
In the meantime, you can learn how to collect a TPU profile using vLLM's native profiling tools [here](../examples/offline_inference/profiling_tpu.md). This profile can provide valuable insights into your workload's performance. - -#### SPMD - -More details to come. - -**Want us to cover something that isn't listed here? Open up an issue please and cite this doc. We'd love to hear your questions or tips.** diff --git a/docs/features/README.md b/docs/features/README.md index ad9de9ff8f368..5faf3768f3214 100644 --- a/docs/features/README.md +++ b/docs/features/README.md @@ -59,20 +59,23 @@ th:not(:first-child) { ### Feature x Hardware -| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | TPU | Intel GPU | -|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----| ------------| -| [CP](../configuration/optimization.md#chunked-prefill) | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [APC](automatic_prefix_caching.md) | [❌](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26963) | -| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | [❌](https://github.com/vllm-project/vllm/issues/26970) | -| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| enc-dec | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | -| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | [🟠](https://github.com/vllm-project/vllm/issues/26965) | -| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| async output | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | -| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/8477) | ✅ | ❌ | ✅ | -| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ 
| ✅ | ❌ | ✅ | -| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | [❌](https://github.com/vllm-project/vllm/issues/25097) | ✅ | +| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | Intel GPU | +|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------| ------------| +| [CP](../configuration/optimization.md#chunked-prefill) | [❌](https://github.com/vllm-project/vllm/issues/2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [APC](automatic_prefix_caching.md) | [❌](https://github.com/vllm-project/vllm/issues/3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [🟠](https://github.com/vllm-project/vllm/issues/26963) | +| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/26970) | +| [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| enc-dec | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | +| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | [🟠](https://github.com/vllm-project/vllm/issues/26965) | +| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | +| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| async output | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | +| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/8477) | ✅ | ✅ | +| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | + +!!! note + For information on feature support on Google TPU, please refer to the [TPU-Inference Recommended Models and Features](https://docs.vllm.ai/projects/tpu/en/latest/recommended_models_features/) documentation. 
diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md index 74f005c496ee5..7b5287bad3bb8 100644 --- a/docs/features/quantization/README.md +++ b/docs/features/quantization/README.md @@ -43,24 +43,27 @@ th:not(:first-child) { } -| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | Google TPU | -|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------|--------------| -| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | -| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | ❌ | -| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | -| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | -| BitBLAS | ✅︎ | ✅ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| BitBLAS (GPTQ) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | -| INC (W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | ❌ | +| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | Intel Gaudi | x86 CPU | +|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-------------|-----------| +| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | +| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ❌ | ✅︎ | +| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ✅︎ | +| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | +| BitBLAS | ✅︎ | ✅ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| BitBLAS (GPTQ) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| DeepSpeedFP | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| GGUF | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | +| INC 
(W8A8) | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅︎ | ❌ | - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. - ✅︎ indicates that the quantization method is supported on the specified hardware. - ❌ indicates that the quantization method is not supported on the specified hardware. +!!! note + For information on quantization support on Google TPU, please refer to the [TPU-Inference Recommended Models and Features](https://docs.vllm.ai/projects/tpu/en/latest/recommended_models_features/) documentation. + !!! note This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. diff --git a/docs/models/hardware_supported_models/tpu.md b/docs/models/hardware_supported_models/tpu.md deleted file mode 100644 index 7b0a5ba6e72da..0000000000000 --- a/docs/models/hardware_supported_models/tpu.md +++ /dev/null @@ -1,34 +0,0 @@ -# TPU - -## Supported Models - -### Text-only Language Models - -| Model | Architecture | Supported | -|-----------------------------------------------------|--------------------------------|-----------| -| mistralai/Mixtral-8x7B-Instruct-v0.1 | MixtralForCausalLM | 🟨 | -| mistralai/Mistral-Small-24B-Instruct-2501 | MistralForCausalLM | ✅ | -| mistralai/Codestral-22B-v0.1 | MistralForCausalLM | ✅ | -| mistralai/Mixtral-8x22B-Instruct-v0.1 | MixtralForCausalLM | ❌ | -| meta-llama/Llama-3.3-70B-Instruct | LlamaForCausalLM | ✅ | -| meta-llama/Llama-3.1-8B-Instruct | LlamaForCausalLM | ✅ | -| meta-llama/Llama-3.1-70B-Instruct | LlamaForCausalLM | ✅ | -| meta-llama/Llama-4-* | Llama4ForConditionalGeneration | ❌ | -| microsoft/Phi-3-mini-128k-instruct | Phi3ForCausalLM | 🟨 | -| microsoft/phi-4 | Phi3ForCausalLM | ❌ | -| google/gemma-3-27b-it | Gemma3ForConditionalGeneration | 🟨 | -| google/gemma-3-4b-it | Gemma3ForConditionalGeneration | ❌ | -| deepseek-ai/DeepSeek-R1 | DeepseekV3ForCausalLM | ❌ | -| deepseek-ai/DeepSeek-V3 | 
DeepseekV3ForCausalLM | ❌ | -| RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 | LlamaForCausalLM | ✅ | -| RedHatAI/Meta-Llama-3.1-70B-Instruct-quantized.w8a8 | LlamaForCausalLM | ✅ | -| Qwen/Qwen3-8B | Qwen3ForCausalLM | ✅ | -| Qwen/Qwen3-32B | Qwen3ForCausalLM | ✅ | -| Qwen/Qwen2.5-7B-Instruct | Qwen2ForCausalLM | ✅ | -| Qwen/Qwen2.5-32B | Qwen2ForCausalLM | ✅ | -| Qwen/Qwen2.5-14B-Instruct | Qwen2ForCausalLM | ✅ | -| Qwen/Qwen2.5-1.5B-Instruct | Qwen2ForCausalLM | 🟨 | - -✅ Runs and optimized. -🟨 Runs and correct but not optimized to green yet. -❌ Does not pass accuracy test or does not run. diff --git a/examples/offline_inference/profiling_tpu/README.md b/examples/offline_inference/profiling_tpu/README.md deleted file mode 100644 index 8c9c1c92b6764..0000000000000 --- a/examples/offline_inference/profiling_tpu/README.md +++ /dev/null @@ -1,70 +0,0 @@ -# vLLM TPU Profiling - -This script is used to profile the TPU performance of vLLM for specific prefill or decode token shapes. - -Note: an actual running server is a mix of both prefill of many shapes and decode of many shapes. - -We assume you are on a TPU already (this was tested on TPU v6e) and have installed vLLM according to the [Google TPU installation guide](https://docs.vllm.ai/en/latest/getting_started/installation/google_tpu.html). - -> In all examples below, we run several warmups before (so `--enforce-eager` is okay) - -## Profile Examples - -### Generate Prefill Trace - -This example runs Qwen/Qwen2.5-7B-Instruct with a single request of 1024 input tokens. This is set up in attempt to profile just the prefill time and operations. 
- -```bash -export XLA_HLO_DEBUG=1 -export MODEL=Qwen/Qwen2.5-7B-Instruct -export VLLM_TPU_PROFILE_DURATION_MS=3000 -export VLLM_TPU_PROFILE_DELAY_MS=0 - -python3 profiling.py \ - --model $MODEL \ - --input-len 1024 --output-len 1 \ - --batch-size 1 --enforce-eager \ - --max-model-len 2048 \ - --tensor-parallel-size 1 \ - --profile-result-dir profiles -``` - -### Generate Decode Trace - -This example runs Llama 3.1 70B with a batch of 32 requests where each has 1 input token and 128 output tokens. This is set up in attempt to profile just the 32 decodes running in parallel by having an extremely small prefill of 1 token and setting `VLLM_TPU_PROFILE_DELAY_MS=1000` to skip the first second of inference (hopefully prefill). - -```bash -export XLA_HLO_DEBUG=1 -export MODEL=meta-llama/Llama-3.1-70B-Instruct -export VLLM_TPU_PROFILE_DURATION_MS=2000 -export VLLM_TPU_PROFILE_DELAY_MS=1000 - -rm -rf ~/.cache/vllm/xla_cache -python3 profiling.py \ - --model $MODEL \ - --input-len 1 \ - --output-len 128 \ - --batch-size 32 \ - --enforce-eager \ - --profile-result-dir profiles \ - --max-model-len 2048 --tensor-parallel-size 8 -``` - -## Visualizing the profiles - -Once you have collected your profiles with this script, you can visualize them using [TensorBoard](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm). 
- -Here are most likely the dependencies you need to install: - -```bash -pip install tensorflow-cpu \ - tensorboard-plugin-profile \ - etils \ - importlib_resources -``` - -Then you just need to point TensorBoard to the directory where you saved the profiles and visit `http://localhost:6006/` in your browser: - -```bash -tensorboard --logdir profiles/ --port 6006 -``` diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py deleted file mode 100644 index 3b127e4fd29df..0000000000000 --- a/examples/offline_inference/profiling_tpu/profiling.py +++ /dev/null @@ -1,110 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -import dataclasses -import os -import time - -import numpy as np -import torch_xla.debug.profiler as xp -from tqdm import tqdm - -from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import EngineArgs -from vllm.inputs import PromptType -from vllm.utils.argparse_utils import FlexibleArgumentParser - -DURATION_MS = int(os.getenv("VLLM_TPU_PROFILE_DURATION_MS", 3000)) -DELAY_MS = int(os.getenv("VLLM_TPU_PROFILE_DELAY_MS", 0)) - - -def main(args: argparse.Namespace): - print(args) - - engine_args = EngineArgs.from_cli_args(args) - llm = LLM(**dataclasses.asdict(engine_args)) - server = xp.start_server(9012) # noqa: F841 - - sampling_params = SamplingParams( - temperature=0.0, - ignore_eos=True, - max_tokens=args.output_len, - ) - print(sampling_params) - dummy_prompt_token_ids = np.random.randint( - 10000, size=(args.batch_size, args.input_len) - ) - dummy_prompts: list[PromptType] = [ - {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist() - ] - - def run_to_completion(): - start_time = time.perf_counter() - llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) - end_time = time.perf_counter() - latency = end_time - start_time - return latency - - # 
Warmup - print("Warming up...") - warmup_latencies = [] - for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): - warmup_latencies.append(run_to_completion()) - print(f"Average warmup latency: {np.mean(warmup_latencies):.4f}s") - - # Profile - profile_dir = args.profile_result_dir - print(f"Profiling (results will be saved to '{profile_dir}')...") - # Enable tracing on server - xp.trace_detached( - "localhost:9012", profile_dir, delay_ms=DELAY_MS, duration_ms=DURATION_MS - ) - if DELAY_MS == 0: - time.sleep(1.0) - profile_latencies = [] - for _ in tqdm(range(args.num_iters), desc="Profile iterations"): - profile_latencies.append(run_to_completion()) - print(f"Average profile latency: {np.mean(profile_latencies):.4f}s") - - return - - -def parse_args(): - parser = FlexibleArgumentParser( - description="Benchmark the latency of processing a single batch of " - "requests till completion." - ) - parser.add_argument("--input-len", type=int, default=32) - parser.add_argument("--output-len", type=int, default=128) - parser.add_argument("--batch-size", type=int, default=8) - parser.add_argument( - "--num-iters-warmup", - type=int, - default=5, - help="Number of iterations to run for warmup.", - ) - parser.add_argument( - "--num-iters", - type=int, - default=1, - help="Number of iterations to run for profiling.", - ) - parser.add_argument( - "--profile-result-dir", - type=str, - default="profiles", - help=( - "path to save the pytorch profiler output. Can be visualized " - "with ui.perfetto.dev or Tensorboard " - "(https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm)." 
- ), - ) - - parser = EngineArgs.add_cli_args(parser) - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - main(args) diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py deleted file mode 100644 index 0093b63b0b1f3..0000000000000 --- a/examples/offline_inference/tpu.py +++ /dev/null @@ -1,58 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import argparse -import os - -from vllm import LLM, SamplingParams - -prompts = [ - "A robot may not injure a human being", - "It is only with the heart that one can see rightly;", - "The greatest glory in living lies not in never falling,", -] -answers = [ - " or, through inaction, allow a human being to come to harm.", - " what is essential is invisible to the eye.", - " but in rising every time we fall.", -] -N = 1 -# Currently, top-p sampling is disabled. `top_p` should be 1.0. -sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16) - - -def main(): - parser = argparse.ArgumentParser(description="TPU offline inference example") - parser.add_argument("--use-spmd", action="store_true", help="Enable SPMD mode") - args = parser.parse_args() - - llm_args = { - "model": "Qwen/Qwen2-1.5B-Instruct", - "max_num_batched_tokens": 64, - "max_num_seqs": 4, - "max_model_len": 128, - } - if args.use_spmd: - os.environ["VLLM_XLA_USE_SPMD"] = "1" - # Can only hardcode the number of chips for now. - # calling xr.global_runtime_device_count() beforeing init SPMD env in - # torch_xla will mess up the distributed env. - llm_args["tensor_parallel_size"] = 8 - # Use Llama, for num_kv_heads = 8. - llm_args["model"] = "meta-llama/Llama-3.1-8B-Instruct" - - # Set `enforce_eager=True` to avoid ahead-of-time compilation. - # In real workloads, `enforce_eager` should be `False`. 
- llm = LLM(**llm_args) - outputs = llm.generate(prompts, sampling_params) - print("-" * 50) - for output, answer in zip(outputs, answers): - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") - assert generated_text.startswith(answer) - print("-" * 50) - - -if __name__ == "__main__": - main() From 986ab5db6325fb4a5d937084ca7921a95641504a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 20 Nov 2025 19:42:33 -0500 Subject: [PATCH 087/249] [CI Bugfix] Fix Kernels DeepGEMM Test (H100) (#29106) Signed-off-by: mgoin --- .buildkite/test-pipeline.yaml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 71249a9543c7c..6169b279dc8a4 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -554,7 +554,6 @@ steps: timeout_in_minutes: 45 gpu: h100 num_gpus: 1 - optional: true source_file_dependencies: - tools/install_deepgemm.sh - vllm/utils/deep_gemm.py @@ -565,10 +564,10 @@ steps: - tests/kernels/moe/test_batched_deepgemm.py - tests/kernels/attention/test_deepgemm_attention.py commands: - - pytest -v -s tests/kernels/quantization/test_block_fp8.py -k deep_gemm - - pytest -v -s tests/kernels/moe/test_deepgemm.py - - pytest -v -s tests/kernels/moe/test_batched_deepgemm.py - - pytest -v -s tests/kernels/attention/test_deepgemm_attention.py + - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s kernels/moe/test_deepgemm.py + - pytest -v -s kernels/moe/test_batched_deepgemm.py + - pytest -v -s kernels/attention/test_deepgemm_attention.py - label: Model Executor Test # 23min timeout_in_minutes: 35 From 87cbbdff639f96766d4f6604cc970394c550dc5b Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 20 Nov 2025 20:16:52 -0500 Subject: [PATCH 088/249] Update model references for OLMo3 (#29099) Signed-off-by: mgoin Signed-off-by: Michael Goin 
Co-authored-by: Cyrus Leung --- docs/models/supported_models.md | 2 +- tests/models/registry.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index f0531ced0aaa3..626904a974155 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -424,7 +424,7 @@ th { | `NemotronHForCausalLM` | Nemotron-H | `nvidia/Nemotron-H-8B-Base-8K`, `nvidia/Nemotron-H-47B-Base-8K`, `nvidia/Nemotron-H-56B-Base-8K`, etc. | ✅︎ | ✅︎ | | `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | ✅︎ | | `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | ✅︎ | -| `OLMo3ForCausalLM` | OLMo3 | TBA | ✅︎ | ✅︎ | +| `OLMo3ForCausalLM` | OLMo3 | `allenai/Olmo-3-7B-Instruct`, `allenai/Olmo-3-32B-Think`, etc. | ✅︎ | ✅︎ | | `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | | `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | ✅︎ | | `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. 
| | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index 1999e3cd2de2d..b088e16756d7a 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -370,7 +370,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { ), "OlmoForCausalLM": _HfExamplesInfo("allenai/OLMo-1B-hf"), "Olmo2ForCausalLM": _HfExamplesInfo("allenai/OLMo-2-0425-1B"), - "Olmo3ForCausalLM": _HfExamplesInfo("shanearora/2025-sep-a-base-model"), + "Olmo3ForCausalLM": _HfExamplesInfo("allenai/Olmo-3-7B-Instruct"), "OlmoeForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924-Instruct"), "OpenPanguMTPModel": _HfExamplesInfo( "FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1", From df44df01431e8af444222addddd2789c0483d70a Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 20 Nov 2025 20:41:49 -0500 Subject: [PATCH 089/249] [Feature] Shared Experts Overlap with FI deepgemm swap kernel, 2.2% throughput improvement and 3.6% TTFT improvement (#28879) Signed-off-by: yewentao256 --- .../fused_moe/fused_moe_modular_method.py | 1 + vllm/model_executor/layers/fused_moe/layer.py | 69 +++++++++------- .../layers/fused_moe/modular_kernel.py | 79 +++++++++++++++++-- .../layers/fused_moe/prepare_finalize.py | 3 +- 4 files changed, 119 insertions(+), 33 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index 43974ba917e42..c6dc95acdb636 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -50,6 +50,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): prepare_finalize, old_quant_method.select_gemm_impl(prepare_finalize, moe_layer), shared_experts, + getattr(moe_layer, "shared_experts_stream", None), ), ) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 
d9525a7439c3e..b2f554efd8a6f 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -850,6 +850,45 @@ class FusedMoE(CustomOp): dp_size=get_dp_group().world_size, ) + def _maybe_setup_shared_experts_stream( + self, + hidden_states: torch.Tensor, + has_separate_shared_experts: bool, + use_chunked_impl: bool, + ) -> tuple[bool, torch.Tensor | None]: + use_shared_experts_stream = ( + has_separate_shared_experts + and not use_chunked_impl + and self.shared_experts_stream is not None + and ( + hidden_states.shape[0] + <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD + ) + ) + + hidden_states_clone: torch.Tensor | None = None + if use_shared_experts_stream: + assert self.shared_experts_stream is not None + + # Clone BEFORE switching streams to avoid race condition + # where routed_expert kernel may mutate hidden_states. + hidden_states_clone = hidden_states.clone() + + # Record that the clone will be used by shared_experts_stream + # to avoid gc issue from deallocation of hidden_states_clone + # For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501 + # NOTE: We dont need shared_output.record_stream(current_stream()) + # because we synch the streams before using shared_output. 
+ hidden_states_clone.record_stream(self.shared_experts_stream) + + # Mark sync start point for the separate shared experts + # stream here since we want to run in parallel with the + # router/gate (next op below) + assert self.shared_experts_stream is not None + self.shared_experts_stream.wait_stream(current_stream()) + + return use_shared_experts_stream, hidden_states_clone + def _load_per_tensor_weight_scale( self, shard_id: str, @@ -1819,36 +1858,12 @@ class FusedMoE(CustomOp): use_chunked_impl = self.use_dp_chunking - use_shared_experts_stream = ( - has_separate_shared_experts - and not use_chunked_impl - and self.shared_experts_stream is not None - and ( - hidden_states.shape[0] - <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD + use_shared_experts_stream, hidden_states_clone = ( + self._maybe_setup_shared_experts_stream( + hidden_states, has_separate_shared_experts, use_chunked_impl ) ) - if use_shared_experts_stream: - assert self.shared_experts_stream is not None - - # Clone BEFORE switching streams to avoid race condition - # where routed_expert kernel may mutate hidden_states. - hidden_states_clone = hidden_states.clone() - - # Record that the clone will be used by shared_experts_stream - # to avoid gc issue from deallocation of hidden_states_clone - # For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501 - # NOTE: We dont need shared_output.record_stream(current_stream()) - # because we synch the streams before using shared_output. - hidden_states_clone.record_stream(self.shared_experts_stream) - - # Mark sync start point for the separate shared experts - # stream here since we want to run in parallel with the - # router/gate (next op below) - assert self.shared_experts_stream is not None - self.shared_experts_stream.wait_stream(current_stream()) - # If router/gate provided, then apply it here. 
# (Note: This code runs only when "overlapped mode" is on to allow # parallel execution of shared experts with the FusedMoE via diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 093affe51f503..4af7af9257dfa 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -16,6 +16,7 @@ from vllm.model_executor.layers.fused_moe.utils import ( count_expert_num_tokens, disable_inplace, ) +from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv from vllm.v1.worker.ubatching import ( dbo_current_ubatch_id, @@ -709,11 +710,13 @@ class FusedMoEModularKernel(torch.nn.Module): prepare_finalize: FusedMoEPrepareAndFinalize, fused_experts: FusedMoEPermuteExpertsUnpermute, shared_experts: torch.nn.Module | None = None, + shared_experts_stream: torch.cuda.Stream | None = None, ): super().__init__() self.prepare_finalize = prepare_finalize self.fused_experts = fused_experts self.shared_experts = shared_experts + self.shared_experts_stream = shared_experts_stream self._post_init_setup() assert ( @@ -890,6 +893,34 @@ class FusedMoEModularKernel(torch.nn.Module): expert_num_tokens_cpu=c_expert_num_tokens_cpu, ) + def _maybe_setup_shared_experts_stream( + self, hidden_states: torch.Tensor + ) -> tuple[bool, torch.Tensor | None]: + # decide whether to run shared experts on a separate CUDA stream to + # overlap with the main fused MoE kernel. 
+ use_shared_experts_stream = ( + self.shared_experts is not None + and self.shared_experts_stream is not None + and hidden_states.is_cuda + and ( + hidden_states.shape[0] + <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD + ) + ) + + hidden_states_clone: torch.Tensor | None = None + if use_shared_experts_stream and self.shared_experts_stream is not None: + # TODO: Optimize this (complicated) + # Note: this clone adds overhead but is required + # for correctness with multiple CUDA streams and CUDA graph capture. + hidden_states_clone = hidden_states.clone() + # record that the clone will be used by the separate stream so its + # lifetime is correctly tracked. + hidden_states_clone.record_stream(self.shared_experts_stream) + self.shared_experts_stream.wait_stream(torch.cuda.current_stream()) + + return use_shared_experts_stream, hidden_states_clone + def _prepare( self, hidden_states: torch.Tensor, @@ -1077,12 +1108,30 @@ class FusedMoEModularKernel(torch.nn.Module): topk_weights: torch.Tensor, topk_ids: torch.Tensor, apply_router_weight_on_input: bool, + hidden_states_clone: torch.Tensor | None = None, + use_shared_experts_stream: bool = False, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: """ The _finalize method is a wrapper around self.prepare_finalize.finalize that handles DBO, async and shared expert overlap. """ - shared_output: torch.Tensor | None = None + + def maybe_run_shared_experts() -> torch.Tensor | None: + if self.shared_experts is None: + return None + + if ( + not use_shared_experts_stream + or self.shared_experts_stream is not None + and (not hidden_states.is_cuda or not torch.cuda.is_available()) + ): + # fall back to running on the current stream + return self.shared_experts(hidden_states) + + assert hidden_states_clone is not None + # launch shared experts on the dedicated stream. 
+ with torch.cuda.stream(self.shared_experts_stream): + return self.shared_experts(hidden_states_clone) if not self.prepare_finalize.supports_async(): assert not dbo_enabled() @@ -1095,8 +1144,7 @@ class FusedMoEModularKernel(torch.nn.Module): apply_router_weight_on_input, self.fused_experts.finalize_weight_and_reduce_impl(), ) - if self.shared_experts is not None: - shared_output = self.shared_experts(hidden_states) + shared_output = maybe_run_shared_experts() else: finalize_ret = self.prepare_finalize.finalize_async( output, @@ -1107,8 +1155,7 @@ class FusedMoEModularKernel(torch.nn.Module): self.fused_experts.finalize_weight_and_reduce_impl(), ) - if self.shared_experts is not None: - shared_output = self.shared_experts(hidden_states) + shared_output = maybe_run_shared_experts() # TODO(lucas): refactor this in the alternative schedules followup # currently unpack if we have hook + receiver pair or just @@ -1131,12 +1178,28 @@ class FusedMoEModularKernel(torch.nn.Module): receiver() + self._wait_for_shared_experts_stream(hidden_states, use_shared_experts_stream) + if self.shared_experts is None: return output else: assert shared_output is not None return shared_output, output + def _wait_for_shared_experts_stream( + self, hidden_states: torch.Tensor, use_shared_experts_stream: bool + ) -> None: + # ensure that any work enqueued on the shared_experts_stream is + # completed before the shared_output tensor is consumed + if ( + self.shared_experts is not None + and use_shared_experts_stream + and self.shared_experts_stream is not None + and hidden_states.is_cuda + and current_platform.is_cuda() + ): + torch.cuda.current_stream().wait_stream(self.shared_experts_stream) + def forward( self, hidden_states: torch.Tensor, @@ -1183,6 +1246,10 @@ class FusedMoEModularKernel(torch.nn.Module): else: output = torch.zeros_like(hidden_states) + use_shared_experts_stream, hidden_states_clone = ( + self._maybe_setup_shared_experts_stream(hidden_states) + ) + local_num_experts = 
w1.size(0) if global_num_experts == -1: global_num_experts = local_num_experts @@ -1219,4 +1286,6 @@ class FusedMoEModularKernel(torch.nn.Module): topk_weights, topk_ids, apply_router_weight_on_input, + hidden_states_clone=hidden_states_clone, + use_shared_experts_stream=use_shared_experts_stream, ) diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py index 9bb976fb9ec93..e27e2eb32da0f 100644 --- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py @@ -45,7 +45,8 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): assert topk == 1, ( "apply_router_weight_on_input is only implemented for topk=1" ) - a1.mul_(topk_weights.to(a1.dtype)) + # Note: do not use inplace for shared experts overlap + a1 = a1 * topk_weights.to(a1.dtype) a1q, a1q_scale = moe_kernel_quantize_input( a1, From 9875be6431872b513a8554c518e48ad79eba4656 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 21 Nov 2025 09:46:43 +0800 Subject: [PATCH 090/249] [LoRA][2/2]Remove LoRA extra vocab (#28545) Signed-off-by: Jee Jee Li --- tests/lora/conftest.py | 10 + tests/lora/test_layers.py | 189 ++----------------- tests/lora/test_llama_tp.py | 84 +++++---- tests/lora/test_lora_functions.py | 4 +- tests/lora/test_lora_manager.py | 20 +- tests/lora/test_worker.py | 8 +- tests/lora/utils.py | 8 - vllm/config/lora.py | 18 +- vllm/engine/arg_utils.py | 5 - vllm/lora/layers/base.py | 1 - vllm/lora/layers/base_linear.py | 1 - vllm/lora/layers/column_parallel_linear.py | 1 - vllm/lora/layers/fused_moe.py | 2 - vllm/lora/layers/logits_processor.py | 55 +----- vllm/lora/layers/vocal_parallel_embedding.py | 33 +--- vllm/lora/lora_weights.py | 24 --- vllm/lora/models.py | 54 +----- vllm/lora/punica_wrapper/punica_base.py | 11 +- vllm/lora/punica_wrapper/punica_gpu.py | 5 +- vllm/lora/punica_wrapper/punica_tpu.py | 3 +- vllm/lora/punica_wrapper/punica_xpu.py | 5 
+- vllm/lora/utils.py | 10 + vllm/lora/worker_manager.py | 9 +- vllm/model_executor/models/granite.py | 34 +--- vllm/model_executor/models/llama.py | 30 +-- vllm/model_executor/models/mixtral.py | 32 +--- vllm/model_executor/models/teleflm.py | 2 +- vllm/v1/worker/tpu_model_runner.py | 3 - 28 files changed, 133 insertions(+), 528 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index d8ff9339bb49b..9d38ec5422794 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -250,6 +250,16 @@ def olmoe_lora_files(): return snapshot_download(repo_id="jeeejeee/olmoe-instruct-text2sql-spider") +@pytest.fixture(scope="session") +def qwen3_lora_files(): + return snapshot_download(repo_id="charent/self_cognition_Alice") + + +@pytest.fixture(scope="session") +def llama32_lora_files(): + return snapshot_download(repo_id="jeeejeee/llama32-3b-text2sql-spider") + + @pytest.fixture def reset_default_device(): """ diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 8f18f01441932..9df3a07a9e5e9 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -136,7 +136,6 @@ def populate_loras( id_to_index: list[int | None], layer: BaseLayerWithLoRA, layer_weights: torch.Tensor, - generate_embeddings_tensor: int = 0, repeats: int = 1, ) -> tuple[dict[int, LoRALayerWeights], dict[int, list[LoRALayerWeights]]]: """This method populates the lora layers with lora weights. @@ -148,8 +147,6 @@ def populate_loras( layer: the LoRAlayer to populate. layer_weights: the PyTorch tensor containing the layer's weights. - generate_embeddings_tensor: whether to generate an - embeddings tensor for each LoRA. repeats: must only be set for column parallel packed layers. Indicates the number of loras to compose together to create a single lora layer. 
@@ -171,7 +168,6 @@ def populate_loras( sublora = DummyLoRAManager(layer_weights.device).init_random_lora( module_name=f"fake_{i}", weight=layer_weights, - generate_embeddings_tensor=generate_embeddings_tensor, ) sublora.lora_b = sublora.lora_b[ (sublora_len * i) : (sublora_len * (i + 1)), : @@ -185,7 +181,6 @@ def populate_loras( slot_idx, lora_a=lora.lora_a, lora_b=lora.lora_b, - embeddings_tensor=lora.embeddings_tensor, ) lora_dict[lora_id] = lora @@ -306,7 +301,6 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: id_to_index, max_loras, vocab_size, - lora_config.lora_extra_vocab_size, ) lora_result = lora_embedding(torch.cat(inputs)) @@ -344,7 +338,6 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: id_to_index, max_loras, vocab_size, - lora_config.lora_extra_vocab_size, ) lora_result = lora_embedding(torch.cat(inputs)) @@ -354,149 +347,6 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol) -@torch.inference_mode() -# @pytest.mark.skip( -# reason="Fails when loras are in any slot other than the first.") -@pytest.mark.parametrize("num_loras", [1, 2, 4]) -@pytest.mark.parametrize("device", DEVICES) -@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) -@pytest.mark.parametrize("stage", STAGES) -def test_embeddings_with_new_embeddings( - dist_init, num_loras, device, vocab_size, stage -) -> None: - if current_platform.is_cuda_alike(): - torch.cuda.set_device(device) - - torch.set_default_device(device) - max_loras = 8 - punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras) - assert check_punica_wrapper(punica_wrapper) - lora_config = LoRAConfig( - max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16 - ) - - def create_random_embedding_layer(): - embedding = VocabParallelEmbedding(vocab_size, 256) - embedding_data = torch.rand_like(embedding.weight.data) 
- embedding.weight.data = embedding_data - embedding.weight.data[vocab_size:, :] = 0 - expanded_embedding = VocabParallelEmbedding( - vocab_size + lora_config.lora_extra_vocab_size * max_loras, - 256, - org_num_embeddings=vocab_size, - ) - expanded_embedding.weight.data[:vocab_size, :] = embedding_data - # We need to deepcopy the embedding as it will be modified - # in place - lora_embedding = VocabParallelEmbeddingWithLoRA(deepcopy(expanded_embedding)) - lora_embedding.create_lora_weights(max_loras, lora_config) - - return expanded_embedding, lora_embedding - - for i in range(NUM_RANDOM_SEEDS): - set_random_seed(i) - - id_to_index = get_random_id_to_index(num_loras, max_loras) - expanded_embedding, lora_embedding = create_random_embedding_layer() - lora_dict, _ = populate_loras( - id_to_index, - layer=lora_embedding, - layer_weights=torch.zeros( - (256, vocab_size + lora_config.lora_extra_vocab_size) - ), - generate_embeddings_tensor=256, - ) - - lora_embedding.set_mapping(punica_wrapper) - # All embeddings tensors have the same shape. - embeddings_tensors = [ - lora_dict[id].embeddings_tensor for id in sorted(lora_dict.keys()) - ] - embeddings_tensor_len = embeddings_tensors[0].shape[0] - - # Add empty embeddings_tensors for unoccupied lora slots. - for _ in range(max_loras - len(embeddings_tensors)): - embeddings_tensors.append(torch.zeros(embeddings_tensors[0].shape)) - - inputs, index_mapping, prompt_mapping = create_random_inputs( - active_lora_ids=list(lora_dict.keys()), - num_inputs=num_loras * 3, - input_size=(200,), - input_range=(1, vocab_size), - device=device, - ) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) - punica_wrapper.update_metadata( - lora_mapping, - id_to_index, - max_loras, - vocab_size, - lora_config.lora_extra_vocab_size, - ) - original_inputs = deepcopy(inputs) - - # Force some of the inputs to be in the extended embeddings range - # to guarantee that their behavior is tested. 
- for input_, original_input_, lora_id in zip( - inputs, original_inputs, prompt_mapping - ): - embedding_id = lora_id - 1 - input_[-1] = vocab_size + (embedding_id * embeddings_tensor_len) - original_input_[-1] = vocab_size - input_[-2] = vocab_size + ((embedding_id + 1) * embeddings_tensor_len - 1) - original_input_[-2] = vocab_size + embeddings_tensor_len - 1 - - expanded_embedding.weight[ - vocab_size : vocab_size + (embeddings_tensor_len * max_loras) - ] = torch.cat(embeddings_tensors) - - lora_result = lora_embedding(torch.cat(original_inputs)) - - expected_results: list[torch.Tensor] = [] - for input_, original_input_, lora_id in zip( - inputs, original_inputs, prompt_mapping - ): - lora = lora_dict[lora_id] - result = expanded_embedding(input_) - after_a = F.embedding( - original_input_, - lora.lora_a.T, - ) - result += after_a @ lora.lora_b.T - expected_results.append(result) - expected_result = torch.cat(expected_results) - - rtol, atol = TOLERANCES[lora_result.dtype] - torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol) - - # Check that resetting the lora weights succeeds - - for slot_idx in range(max_loras): - lora_embedding.reset_lora(slot_idx) - - inputs, index_mapping, prompt_mapping = create_random_inputs( - active_lora_ids=[0], - num_inputs=num_loras * 3, - input_size=(200,), - input_range=(1, vocab_size), - device=device, - ) - original_inputs = deepcopy(inputs) - lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) - punica_wrapper.update_metadata( - lora_mapping, - id_to_index, - max_loras, - vocab_size, - lora_config.lora_extra_vocab_size, - ) - lora_result = lora_embedding(torch.cat(original_inputs)) - expected_result = expanded_embedding(torch.cat(inputs)) - - rtol, atol = TOLERANCES[lora_result.dtype] - torch.testing.assert_close(lora_result, expected_result, rtol=rtol, atol=atol) - - @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("device", 
DEVICES) @@ -518,16 +368,13 @@ def test_lm_head_logits_processor( def _pretest(): linear = ParallelLMHead( - vocab_size + lora_config.lora_extra_vocab_size, - 1024, - vocab_size, + num_embeddings=vocab_size, + embedding_dim=1024, params_dtype=torch.float16, ) linear.weight.data = torch.rand_like(linear.weight.data) linear.weight.data[:, vocab_size:] = 0 - logits_processor = LogitsProcessor( - vocab_size + lora_config.lora_extra_vocab_size, vocab_size - ) + logits_processor = LogitsProcessor(vocab_size) lora_logits_processor = LogitsProcessorWithLoRA( logits_processor, 1024, linear.weight.dtype, linear.weight.device, None ) @@ -541,15 +388,12 @@ def test_lm_head_logits_processor( id_to_index = get_random_id_to_index(num_loras, max_loras) linear, logits_processor, lora_logits_processor = _pretest() lora_logits_processor.set_mapping(punica_wrapper) - # NOTE: all the generated loras share the same embeddings tensor. + lora_dict, _ = populate_loras( id_to_index, layer=lora_logits_processor, layer_weights=linear.weight, - generate_embeddings_tensor=1024, ) - embeddings_tensor = list(lora_dict.values())[0].embeddings_tensor - embeddings_tensor_len = embeddings_tensor.shape[0] inputs, index_mapping, prompt_mapping = create_random_inputs( active_lora_ids=list(lora_dict.keys()), @@ -565,7 +409,6 @@ def test_lm_head_logits_processor( id_to_index, max_loras, vocab_size, - lora_config.lora_extra_vocab_size, ) input_ = torch.rand(20, 1024) @@ -575,23 +418,16 @@ def test_lm_head_logits_processor( original_lm_head = deepcopy(linear) - linear.weight[ - logits_processor.org_vocab_size : logits_processor.org_vocab_size - + embeddings_tensor_len - ] = embeddings_tensor - - logits_processor.org_vocab_size = vocab_size + lora_config.lora_extra_vocab_size expected_results: list[torch.Tensor] = [] for input_, lora_id in zip(inputs, prompt_mapping): lora = lora_dict[lora_id] result = logits_processor._get_logits( hidden_states=input_, lm_head=linear, embedding_bias=None ) - result[:, 
vocab_size + embeddings_tensor_len :] = float("-inf") + result += input_ @ lora.lora_a.T @ lora.lora_b.T * lora.scaling expected_results.append(result) expected_result = torch.cat(expected_results) - logits_processor.org_vocab_size = vocab_size # Check that resetting the lora weights succeeds @@ -612,7 +448,6 @@ def test_lm_head_logits_processor( id_to_index, max_loras, vocab_size, - lora_config.lora_extra_vocab_size, ) lora_result = lora_logits_processor._get_logits( @@ -694,7 +529,6 @@ def test_linear_replicated( id_to_index, max_loras, 512, - lora_config.lora_extra_vocab_size, ) lora_result = lora_linear(torch.cat(inputs))[0] @@ -726,7 +560,10 @@ def test_linear_replicated( lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) punica_wrapper.update_metadata( - lora_mapping, id_to_index, max_loras, 512, lora_config.lora_extra_vocab_size + lora_mapping, + id_to_index, + max_loras, + 512, ) lora_result = lora_linear(torch.cat(inputs))[0] @@ -817,7 +654,6 @@ def test_linear_parallel( id_to_index, max_loras, 512, - lora_config.lora_extra_vocab_size, ) lora_result = lora_linear(torch.cat(inputs))[0] @@ -849,7 +685,10 @@ def test_linear_parallel( lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) punica_wrapper.update_metadata( - lora_mapping, id_to_index, max_loras, 512, lora_config.lora_extra_vocab_size + lora_mapping, + id_to_index, + max_loras, + 512, ) lora_result = lora_linear(torch.cat(inputs))[0] @@ -963,7 +802,6 @@ def test_column_parallel_packed( id_to_index, max_loras, 512, - lora_config.lora_extra_vocab_size, ) lora_result = lora_linear(torch.cat(inputs))[0] @@ -1000,7 +838,6 @@ def test_column_parallel_packed( id_to_index, max_loras, 512, - lora_config.lora_extra_vocab_size, ) lora_result = lora_linear(torch.cat(inputs))[0] diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 7bbd1e364d19e..18704fa6e45de 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ 
-13,17 +13,27 @@ from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test -MODEL_PATH = "meta-llama/Llama-2-7b-hf" +PROMPT_TEMPLATE = """<|eot_id|><|start_header_id|>user<|end_header_id|> +I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request. +" +##Instruction: +candidate_poll contains tables such as candidate, people. Table candidate has columns such as Candidate_ID, People_ID, Poll_Source, Date, Support_rate, Consider_rate, Oppose_rate, Unsure_rate. Candidate_ID is the primary key. +Table people has columns such as People_ID, Sex, Name, Date_of_Birth, Height, Weight. People_ID is the primary key. +The People_ID of candidate is the foreign key of People_ID of people. +###Input: +{context} +###Response:<|eot_id|><|start_header_id|>assistant<|end_header_id|> +""" # noqa: E501 EXPECTED_LORA_OUTPUT = [ - " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 - " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", - " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 - " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 - " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", - " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' ", # noqa: E501 + "SELECT count(*) FROM candidate", + "SELECT count(*) FROM candidate", + "SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501 + "SELECT poll_source FROM candidate GROUP BY 
poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501 ] +MODEL_PATH = "meta-llama/Llama-3.2-3B-Instruct" + def do_sample( llm: vllm.LLM, @@ -32,18 +42,19 @@ def do_sample( tensorizer_config_dict: dict | None = None, ) -> list[str]: prompts = [ - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
[/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]", # noqa: E501 + PROMPT_TEMPLATE.format(context="How many candidates are there?"), + PROMPT_TEMPLATE.format(context="Count the number of candidates."), + PROMPT_TEMPLATE.format( + context="Which poll resource provided the most number of candidate information?" # noqa: E501 + ), + PROMPT_TEMPLATE.format( + context="Return the poll resource associated with the most candidates." + ), ] sampling_params = vllm.SamplingParams( - temperature=0, max_tokens=256, skip_special_tokens=False, stop=["[/assistant]"] + temperature=0, max_tokens=64, stop=["<|im_end|>"] ) - if tensorizer_config_dict is not None: outputs = llm.generate( prompts, @@ -75,13 +86,15 @@ def do_sample( return generated_texts -def generate_and_test(llm, sql_lora_files, tensorizer_config_dict: dict | None = None): +def generate_and_test( + llm, llama32_lora_files, tensorizer_config_dict: dict | None = None +): print("lora adapter created") print("lora 1") assert ( do_sample( llm, - sql_lora_files, + llama32_lora_files, tensorizer_config_dict=tensorizer_config_dict, lora_id=1, ) @@ -92,7 +105,7 @@ def generate_and_test(llm, sql_lora_files, tensorizer_config_dict: dict | None = assert ( do_sample( llm, - sql_lora_files, + llama32_lora_files, tensorizer_config_dict=tensorizer_config_dict, lora_id=2, ) @@ -104,51 +117,52 @@ def generate_and_test(llm, sql_lora_files, tensorizer_config_dict: dict | None = @create_new_process_for_each_test() @pytest.mark.parametrize("cudagraph_specialize_lora", [True, False]) -def test_llama_lora(sql_lora_files, cudagraph_specialize_lora: bool): +def test_llama_lora(llama32_lora_files, cudagraph_specialize_lora: bool): llm = vllm.LLM( MODEL_PATH, - tokenizer=sql_lora_files, enable_lora=True, # also test 
odd max_num_seqs - max_num_seqs=13, + max_num_seqs=7, + max_model_len=1024, max_loras=4, compilation_config=vllm.config.CompilationConfig( cudagraph_specialize_lora=cudagraph_specialize_lora, ), ) - generate_and_test(llm, sql_lora_files) + generate_and_test(llm, llama32_lora_files) @multi_gpu_test(num_gpus=4) -def test_llama_lora_tp4(sql_lora_files): +def test_llama_lora_tp4(llama32_lora_files): llm = vllm.LLM( MODEL_PATH, - tokenizer=sql_lora_files, enable_lora=True, - max_num_seqs=16, + max_num_seqs=7, + max_model_len=1024, max_loras=4, tensor_parallel_size=4, ) - generate_and_test(llm, sql_lora_files) + generate_and_test(llm, llama32_lora_files) @multi_gpu_test(num_gpus=4) -def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): +def test_llama_lora_tp4_fully_sharded_loras(llama32_lora_files): llm = vllm.LLM( MODEL_PATH, - tokenizer=sql_lora_files, enable_lora=True, - max_num_seqs=16, + max_num_seqs=8, max_loras=4, + max_model_len=1024, tensor_parallel_size=4, fully_sharded_loras=True, ) - generate_and_test(llm, sql_lora_files) + generate_and_test(llm, llama32_lora_files) @multi_gpu_test(num_gpus=2) def test_tp2_serialize_and_deserialize_lora( - tmp_path, sql_lora_files, sql_lora_huggingface_id + tmp_path, + llama32_lora_files, ): # Run the tensorizing of the LoRA adapter and the model in a subprocess # to guarantee cleanup @@ -157,7 +171,7 @@ def test_tp2_serialize_and_deserialize_lora( model_name = "model-rank-%03d.tensors" model_ref = MODEL_PATH - lora_path = sql_lora_huggingface_id + lora_path = llama32_lora_files suffix = "test" try: result = subprocess.run( @@ -195,12 +209,12 @@ def test_tp2_serialize_and_deserialize_lora( loaded_llm = LLM( model=model_ref, - tokenizer=sql_lora_files, load_format="tensorizer", enable_lora=True, enforce_eager=True, model_loader_extra_config=tensorizer_config, - max_num_seqs=13, + max_num_seqs=7, + max_model_len=1024, tensor_parallel_size=2, max_loras=2, ) @@ -211,7 +225,7 @@ def test_tp2_serialize_and_deserialize_lora( 
print("lora 1") assert ( do_sample( - loaded_llm, sql_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=1 + loaded_llm, llama32_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=1 ) == EXPECTED_LORA_OUTPUT ) diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py index e914393fee8aa..1c692630284d0 100644 --- a/tests/lora/test_lora_functions.py +++ b/tests/lora/test_lora_functions.py @@ -13,8 +13,8 @@ from vllm.entrypoints.openai.api_server import ( from vllm.lora.request import LoRARequest from vllm.v1.engine.llm_engine import LLMEngine -MODEL_PATH = "meta-llama/Llama-2-7b-hf" -LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test" +MODEL_PATH = "Qwen/Qwen3-0.6B" +LORA_MODULE_PATH = "charent/self_cognition_Alice" LORA_RANK = 8 diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index e7816031142e3..24d4dfca46d62 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -48,9 +48,6 @@ DEFAULT_DTYPE = torch.get_default_dtype() @pytest.mark.parametrize("device", DEVICES) def test_from_lora_tensors(sql_lora_files, device): tensors = load_file(os.path.join(sql_lora_files, "adapter_model.safetensors")) - new_embeddings = load_file( - os.path.join(sql_lora_files, "new_embeddings.safetensors") - ) peft_helper = PEFTHelper.from_local_dir( sql_lora_files, max_position_embeddings=4096 @@ -60,7 +57,6 @@ def test_from_lora_tensors(sql_lora_files, device): tensors, peft_helper=peft_helper, device=device, - embeddings=new_embeddings, embedding_modules=EMBEDDING_MODULES, embedding_padding_modules=EMBEDDING_PADDING_MODULES, ) @@ -76,18 +72,6 @@ def test_from_lora_tensors(sql_lora_files, device): f"{lora.lora_a.shape=}, {lora.lora_b.shape=}" ) assert lora.lora_a.shape[0] == 8 - embeddings_module = next( - (k for k in EMBEDDING_MODULES if k in module_name), None - ) - if embeddings_module: - assert torch.equal( - lora.embeddings_tensor, - new_embeddings[EMBEDDING_MODULES[embeddings_module]].to( - 
device=lora.embeddings_tensor.device - ), - ) - else: - assert lora.embeddings_tensor is None def create_lora( @@ -552,9 +536,7 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path worker_adapter_manager = WorkerLoRAManager( vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES ) - worker_adapter_manager.vocab_size = ( - dummy_model_gate_up.unpadded_vocab_size - lora_config.lora_extra_vocab_size - ) + worker_adapter_manager.vocab_size = dummy_model_gate_up.unpadded_vocab_size worker_adapter_manager.create_lora_manager(dummy_model_gate_up) dummy_lora_files = f"{tmp_path}/lora_adapter" diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index c97f8debd1b9a..b163559a9414d 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -20,11 +20,12 @@ from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest from vllm.v1.worker.gpu_worker import Worker +MODEL_PATH = "Qwen/Qwen3-0.6B" NUM_LORAS = 16 @patch.dict(os.environ, {"RANK": "0"}) -def test_worker_apply_lora(sql_lora_files): +def test_worker_apply_lora(qwen3_lora_files): def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]): lora_mapping = LoRAMapping([], []) @@ -34,9 +35,10 @@ def test_worker_apply_lora(sql_lora_files): vllm_config = VllmConfig( model_config=ModelConfig( - "meta-llama/Llama-2-7b-hf", + MODEL_PATH, seed=0, dtype="float16", + max_model_len=127, enforce_eager=True, ), load_config=LoadConfig( @@ -73,7 +75,7 @@ def test_worker_apply_lora(sql_lora_files): assert worker.list_loras() == set() lora_requests = [ - LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(NUM_LORAS) + LoRARequest(str(i + 1), i + 1, qwen3_lora_files) for i in range(NUM_LORAS) ] set_active_loras(worker, lora_requests) diff --git a/tests/lora/utils.py b/tests/lora/utils.py index d30b77f094665..6aba5299b5829 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -28,7 +28,6 @@ class DummyLoRAManager: 
module_name: str, weight: torch.Tensor, rank: int = 8, - generate_embeddings_tensor: int = 0, ): lora = LoRALayerWeights( module_name, @@ -41,13 +40,6 @@ class DummyLoRAManager: [weight.shape[0], rank], dtype=weight.dtype, device=self._device ), ) - if generate_embeddings_tensor: - lora.embeddings_tensor = torch.rand( - 5, - generate_embeddings_tensor, - dtype=weight.dtype, - device=self._device, - ) self.set_module_lora(module_name, lora) return lora diff --git a/vllm/config/lora.py b/vllm/config/lora.py index 84e92eef40077..072e0ec2104f5 100644 --- a/vllm/config/lora.py +++ b/vllm/config/lora.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import hashlib -from typing import TYPE_CHECKING, Any, ClassVar, Literal +from typing import TYPE_CHECKING, Any, Literal import torch from pydantic import ConfigDict, Field, model_validator @@ -11,7 +11,6 @@ from typing_extensions import Self from vllm.config.utils import config from vllm.logger import init_logger -from vllm.platforms import current_platform if TYPE_CHECKING: from vllm.config import ModelConfig @@ -46,19 +45,6 @@ class LoRAConfig: `max_loras`.""" lora_dtype: torch.dtype | LoRADType = "auto" """Data type for LoRA. If auto, will default to base model dtype.""" - lora_extra_vocab_size: LoRAExtraVocabSize = Field( - default=256, - deprecated=( - "`lora_extra_vocab_size` is deprecated and will be removed " - "in v0.12.0. Additional vocabulary support for " - "LoRA adapters is being phased out." - ), - ) - """(Deprecated) Maximum size of extra vocabulary that can be present in a - LoRA adapter. 
Will be removed in v0.12.0.""" - lora_vocab_padding_size: ClassVar[int] = ( - current_platform.get_lora_vocab_padding_size() - ) default_mm_loras: dict[str, str] | None = None """Dictionary mapping specific modalities to LoRA model paths; this field is only applicable to multimodal models and should be leveraged when a @@ -87,8 +73,6 @@ class LoRAConfig: factors.append(self.max_loras) factors.append(self.fully_sharded_loras) factors.append(self.lora_dtype) - factors.append(self.lora_extra_vocab_size) - factors.append(self.lora_vocab_padding_size) hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() return hash_str diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 74828bc109cbe..bcb90119f9b04 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -484,7 +484,6 @@ class EngineArgs: fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras max_cpu_loras: int | None = LoRAConfig.max_cpu_loras lora_dtype: str | torch.dtype | None = LoRAConfig.lora_dtype - lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override @@ -1011,9 +1010,6 @@ class EngineArgs: ) lora_group.add_argument("--max-loras", **lora_kwargs["max_loras"]) lora_group.add_argument("--max-lora-rank", **lora_kwargs["max_lora_rank"]) - lora_group.add_argument( - "--lora-extra-vocab-size", **lora_kwargs["lora_extra_vocab_size"] - ) lora_group.add_argument( "--lora-dtype", **lora_kwargs["lora_dtype"], @@ -1680,7 +1676,6 @@ class EngineArgs: max_loras=self.max_loras, default_mm_loras=self.default_mm_loras, fully_sharded_loras=self.fully_sharded_loras, - lora_extra_vocab_size=self.lora_extra_vocab_size, lora_dtype=self.lora_dtype, max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras and self.max_cpu_loras > 0 diff --git a/vllm/lora/layers/base.py b/vllm/lora/layers/base.py index 
0c7e806848892..62326c05b2bd1 100644 --- a/vllm/lora/layers/base.py +++ b/vllm/lora/layers/base.py @@ -44,7 +44,6 @@ class BaseLayerWithLoRA(nn.Module): index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None, ): """Overwrites lora tensors at index.""" ... diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py index 3db4165e20176..e85c5bd70b072 100644 --- a/vllm/lora/layers/base_linear.py +++ b/vllm/lora/layers/base_linear.py @@ -96,7 +96,6 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None, ): # Except for QKVParallelLinearWithLoRA and # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py index 637ded9b2a0f0..273c4950e3239 100644 --- a/vllm/lora/layers/column_parallel_linear.py +++ b/vllm/lora/layers/column_parallel_linear.py @@ -248,7 +248,6 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None, ): self.reset_lora(index) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 3291c41fcda1e..adf30855cafc3 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -406,8 +406,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None, - bias: torch.Tensor | None = None, ): """Overwrites lora tensors at index.""" self.reset_lora(index) diff --git a/vllm/lora/layers/logits_processor.py b/vllm/lora/layers/logits_processor.py index adc5e861f57fb..06f92652031e1 100644 --- a/vllm/lora/layers/logits_processor.py +++ b/vllm/lora/layers/logits_processor.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project -import math import torch import torch.nn as nn @@ -108,22 +107,13 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA): ( max_loras, 1, - # Pad for kernel compatibility - math.ceil( - self.base_layer.vocab_size / lora_config.lora_vocab_padding_size - ) - * lora_config.lora_vocab_padding_size, + self.base_layer.vocab_size, lora_config.max_lora_rank, ), dtype=lora_config.lora_dtype, device=self.device, ) - self.embeddings_tensors = torch.full( - (max_loras, lora_config.lora_extra_vocab_size, self.hidden_size), - fill_value=float("-inf"), - dtype=self.dtype, - device=self.device, - ) + if self.sharded_to_full_mapping is not None: self.sharded_to_full_mapping_gpu = torch.tensor( self.sharded_to_full_mapping, device=self.device, dtype=torch.long @@ -134,14 +124,12 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA): def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 - self.embeddings_tensors[index] = float("-inf") def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None, ): self.reset_lora(index) self.lora_a_stacked[index, 0, : lora_a.shape[0], : lora_a.shape[1]].copy_( @@ -150,12 +138,6 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA): self.lora_b_stacked[index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_( lora_b, non_blocking=True ) - if embeddings_tensor is not None: - self.embeddings_tensors[ - index, - : embeddings_tensor.shape[0], - : embeddings_tensor.shape[1], - ] = embeddings_tensor def _get_logits( self, @@ -193,39 +175,6 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA): # token_id: [0, 1, 2, 3, 4, 5, -1, -1] logits = logits[:, self.sharded_to_full_mapping_gpu] - lora_logits = torch.empty( - self.embeddings_tensors.shape[0] + 1, - self.embeddings_tensors.shape[1], - hidden_states.shape[0], - dtype=self.embeddings_tensors.dtype, - device=self.embeddings_tensors.device, - ) - torch.matmul(self.embeddings_tensors, 
hidden_states.T, out=lora_logits[:-1]) - - neg_inf, pos_inf = current_platform.get_infinity_values(lora_logits.dtype) - - lora_logits[-1] = neg_inf - lora_logits = lora_logits.mT - indices_padded = self.punica_wrapper.sampler_indices_padded - - if current_platform.is_tpu() or current_platform.is_xpu(): - indices_padded = indices_padded[: logits.size(0)] - - lora_logits = ( - lora_logits.reshape( - lora_logits.shape[0] * lora_logits.shape[1], - lora_logits.shape[2], - ) - .index_select(0, indices_padded) - .nan_to_num_(nan=neg_inf, posinf=pos_inf, neginf=neg_inf) - ) - - logits[ - :, - self.base_layer.org_vocab_size : self.base_layer.org_vocab_size - + lora_logits.shape[1], - ] = lora_logits - lora_output: torch.Tensor | None = self.punica_wrapper.add_lora_logits( logits, hidden_states, self.lora_a_stacked, self.lora_b_stacked, 1.0 ) diff --git a/vllm/lora/layers/vocal_parallel_embedding.py b/vllm/lora/layers/vocal_parallel_embedding.py index ca4ad8012e9c3..5b1f7886bc238 100644 --- a/vllm/lora/layers/vocal_parallel_embedding.py +++ b/vllm/lora/layers/vocal_parallel_embedding.py @@ -46,19 +46,10 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA): self.embeddings_slice = None self.embeddings_weights = None - self.embeddings_tensors = torch.zeros( - ( - max_loras, - lora_config.lora_extra_vocab_size, - self.base_layer.embedding_dim, - ), - dtype=self.base_layer.weight.dtype, - device=self.base_layer.weight.device, - ) self.lora_a_stacked = torch.zeros( ( max_loras, - self.base_layer.org_vocab_size + lora_config.lora_extra_vocab_size, + self.base_layer.org_vocab_size, lora_config.max_lora_rank, ), dtype=lora_config.lora_dtype, @@ -82,14 +73,12 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA): def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 - self.embeddings_tensors[index] = 0 def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None, ): 
self.reset_lora(index) # NOTE self.lora_a_stacked is row-major, and lora_a is col-major, @@ -100,36 +89,18 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA): self.lora_b_stacked[index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_( lora_b, non_blocking=True ) - if embeddings_tensor is not None: - self.embeddings_tensors[ - index, - : embeddings_tensor.shape[0], - : embeddings_tensor.shape[1], - ].copy_(embeddings_tensor, non_blocking=True) - if self.embeddings_slice is not None: - # TODO(yard1): Optimize this copy, we don't need to copy - # everything, just the modified part - embeddings = self.embeddings_tensors.view( - self.embeddings_tensors.shape[0] * self.embeddings_tensors.shape[1], - self.embeddings_tensors.shape[2], - )[self.embeddings_slice[0] : self.embeddings_slice[1]] - assert self.embeddings_weights is not None - self.embeddings_weights[: embeddings.shape[0]].copy_(embeddings) def forward(self, x: torch.Tensor) -> torch.Tensor: - added_tokens_mask = torch.where(x > self.base_layer.org_vocab_size - 1, 1, 0) - # NB: Don't use torch.narrow here. 
torch.narrow triggers some # Dynamic Shape specialization in torch.compile num_tokens = x.shape[0] indices_1 = self.punica_wrapper._embeddings_indices[1][:num_tokens] - indices_0 = self.punica_wrapper._embeddings_indices[0][:num_tokens] full_lora_a_embeddings = F.embedding( x + indices_1, self.lora_a_stacked_2d, ) - full_output = self.base_layer.forward(x + (indices_0 * added_tokens_mask)) + full_output = self.base_layer.forward(x) full_output_org = full_output if full_output.ndim == 3: diff --git a/vllm/lora/lora_weights.py b/vllm/lora/lora_weights.py index 7691481d5039e..f0d8e22194050 100644 --- a/vllm/lora/lora_weights.py +++ b/vllm/lora/lora_weights.py @@ -21,7 +21,6 @@ class LoRALayerWeights: lora_alpha: int, lora_a: torch.Tensor, lora_b: torch.Tensor, - embeddings_tensor: torch.Tensor | None = None, scaling: float | None = None, ) -> None: self.module_name = module_name @@ -29,7 +28,6 @@ class LoRALayerWeights: self.lora_alpha = lora_alpha self.lora_a = lora_a self.lora_b = lora_b - self.embeddings_tensor = embeddings_tensor if scaling is None: self.scaling = self.lora_alpha / self.rank @@ -56,18 +54,11 @@ class LoRALayerWeights: def is_packed(self) -> bool: return False - @property - def extra_vocab_size(self) -> int: - return ( - self.embeddings_tensor.shape[0] if self.embeddings_tensor is not None else 0 - ) - @classmethod def from_config( cls, module_name: str, peft_helper: PEFTHelper, - embeddings_tensor: torch.Tensor | None = None, ) -> "LoRALayerWeights": # lora_a and lora_b are set to None for config-based construction return cls( @@ -76,7 +67,6 @@ class LoRALayerWeights: peft_helper.lora_alpha, None, None, - embeddings_tensor, peft_helper.vllm_lora_scaling_factor, ) @@ -89,7 +79,6 @@ class LoRALayerWeights: rank: int, dtype: torch.dtype, device: torch.types.Device, - embeddings_tensor_dim: int | None = None, ) -> "LoRALayerWeights": pin_memory = str(device) == "cpu" and is_pin_memory_available() lora_a = torch.zeros( @@ -99,24 +88,12 @@ class 
LoRALayerWeights: [output_dim, rank], dtype=dtype, device=device, pin_memory=pin_memory ) - embeddings_tensor = ( - torch.rand( - 10, - embeddings_tensor_dim, - dtype=dtype, - device=device, - pin_memory=pin_memory, - ) - if embeddings_tensor_dim - else None - ) return cls( module_name, rank=rank, lora_alpha=1, lora_a=lora_a, lora_b=lora_b, - embeddings_tensor=embeddings_tensor, ) @@ -139,7 +116,6 @@ class PackedLoRALayerWeights(LoRALayerWeights): lora_a=lora_a, lora_b=lora_b, scaling=scaling, # type: ignore - embeddings_tensor=None, ) self.lora_alphas = lora_alphas if scaling is None: diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 02c252f15bfab..eb11cd0afc487 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -21,6 +21,7 @@ from vllm.lora.utils import ( from_layer, from_layer_logits_processor, get_supported_lora_modules, + is_base_embeddding_weights, is_regex_target_modules, parse_fine_tuned_lora_name, process_packed_modules_mapping, @@ -93,14 +94,6 @@ class LoRAModel: loras=self.loras.copy(), ) - @property - def extra_vocab_size(self) -> int: - return ( - max(lora.extra_vocab_size for lora in self.loras.values()) - if self.loras - else 0 - ) - def get_lora(self, module_name: str) -> LoRALayerWeights | None: """Get LoRA for a given module by name""" return self.loras.get(module_name, None) @@ -117,7 +110,6 @@ class LoRAModel: peft_helper: PEFTHelper, device: str = "cuda", dtype: torch.dtype | None = None, - embeddings: dict[str, torch.Tensor] | None = None, target_embedding_padding: int | None = None, embedding_modules: dict[str, str] | None = None, embedding_padding_modules: list[str] | None = None, @@ -127,24 +119,14 @@ class LoRAModel: pin_memory = str(device) == "cpu" and is_pin_memory_available() loras: dict[str, LoRALayerWeights] = {} for tensor_name, tensor in tensors.items(): + if is_base_embeddding_weights(tensor_name): + continue module_name, is_lora_a = parse_fine_tuned_lora_name( tensor_name, weights_mapper ) if module_name not 
in loras: - lora_embeddings_tensor = None - if embeddings: - assert embedding_modules is not None - embeddings_module = next( - (k for k in embedding_modules if k in module_name), None - ) - if embeddings_module: - lora_embeddings_tensor = embeddings[ - embedding_modules[embeddings_module] - ].to(device=device, dtype=dtype) - if pin_memory: - lora_embeddings_tensor = lora_embeddings_tensor.pin_memory() loras[module_name] = LoRALayerWeights.from_config( - module_name, peft_helper, lora_embeddings_tensor + module_name, peft_helper ) if is_lora_a: @@ -206,15 +188,17 @@ class LoRAModel: lora_tensor_path = os.path.join(lora_dir, "adapter_model.safetensors") lora_bin_file_path = os.path.join(lora_dir, "adapter_model.bin") lora_pt_file_path = os.path.join(lora_dir, "adapter_model.pt") - new_embeddings_tensor_path = os.path.join( - lora_dir, "new_embeddings.safetensors" - ) - new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin") + # new_embeddings_tensor_path = os.path.join( + # lora_dir, "new_embeddings.safetensors" + # ) + # new_embeddings_bin_file_path = os.path.join(lora_dir, "new_embeddings.bin") tensors: dict[str, torch.Tensor] = {} unexpected_modules: list[list[str] | str] = [] def check_unexpected_modules(modules: dict): for lora_module in modules.keys(): # noqa + if is_base_embeddding_weights(lora_module): + continue module_name, _ = parse_fine_tuned_lora_name(lora_module, weights_mapper) # Handle FSDP file format where experts.base_layer is the # gate_up_proj and experts is the down_proj @@ -300,21 +284,12 @@ class LoRAModel: else: raise ValueError(f"{lora_dir} doesn't contain tensors") - embeddings = None - if os.path.isfile(new_embeddings_tensor_path): - embeddings = safetensors.torch.load_file(new_embeddings_tensor_path) - elif os.path.isfile(new_embeddings_bin_file_path): - embeddings = torch.load( - new_embeddings_bin_file_path, map_location=device, weights_only=True - ) - return cls.from_lora_tensors( lora_model_id=get_lora_id() if 
lora_model_id is None else lora_model_id, tensors=tensors, peft_helper=peft_helper, device=device, dtype=dtype, - embeddings=embeddings, target_embedding_padding=target_embedding_padding, embedding_modules=embedding_modules, embedding_padding_modules=embedding_padding_modules, @@ -474,7 +449,6 @@ class LoRAModelManager: index, module_lora.lora_a, module_lora.lora_b, - module_lora.embeddings_tensor, ) else: module.reset_lora(index) @@ -505,7 +479,6 @@ class LoRAModelManager: self.lora_index_to_id, self.lora_slots + 1, self.vocab_size, - self.lora_config.lora_extra_vocab_size, ) def remove_all_adapters(self): @@ -616,7 +589,6 @@ class LoRAModelManager: if parts[-1] in embedding_modules: input_dim = ( module.base_layer.org_vocab_size - + self.lora_config.lora_extra_vocab_size if hasattr(module.base_layer, "org_vocab_size") else module.base_layer.weight.shape[1] ) @@ -625,11 +597,6 @@ class LoRAModelManager: if hasattr(module.base_layer, "embedding_dim") else module.base_layer.weight.shape[0] ) - embeddings_tensor_dim = ( - module.base_layer.embedding_dim - if hasattr(module.base_layer, "embedding_dim") - else module.base_layer.weight.shape[1] - ) lora = LoRALayerWeights.create_dummy_lora_weights( module_name, input_dim, @@ -637,7 +604,6 @@ class LoRAModelManager: rank, module.lora_a_stacked[0].dtype, "cpu", - embeddings_tensor_dim=embeddings_tensor_dim, ) else: lora = LoRALayerWeights.create_dummy_lora_weights( diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index a6ffbb7b71ce4..7c0fc8167711d 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -31,7 +31,6 @@ class PunicaWrapperABC(ABC): lora_index_to_id: list[int | None], max_loras: int, vocab_size: int, - extra_vocab_size: int, **kwargs, ) -> None: """ @@ -172,8 +171,11 @@ class PunicaWrapperBase(PunicaWrapperABC): lora_index_to_id: list[int | None], max_loras: int, vocab_size: int, - extra_vocab_size: int, ): + # NOTE We 
have remove lora extra vocab support for now. So we set + # extra_vocab_size alwayzs to 0, and extra_vocab_size will be removed. + + extra_vocab_size = 0 ( base_indices, sampler_indices, @@ -285,12 +287,9 @@ class PunicaWrapperBase(PunicaWrapperABC): lora_index_to_id: list[int | None], max_loras: int, vocab_size: int, - extra_vocab_size: int, **kwargs, ): - self._update_base_metadata( - mapping, lora_index_to_id, max_loras, vocab_size, extra_vocab_size - ) + self._update_base_metadata(mapping, lora_index_to_id, max_loras, vocab_size) if mapping.is_prefill: # Update metadata required for prefill-related operators. diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index d863a5884d3c5..52138ef0cc3b0 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -65,13 +65,10 @@ class PunicaWrapperGPU(PunicaWrapperBase): lora_index_to_id: list[int | None], max_loras: int, vocab_size: int, - extra_vocab_size: int, **kwargs, ): self.is_prefill = mapping.is_prefill - self._update_base_metadata( - mapping, lora_index_to_id, max_loras, vocab_size, extra_vocab_size - ) + self._update_base_metadata(mapping, lora_index_to_id, max_loras, vocab_size) # Prepare cuda kernel metadata tensors self.token_mapping_meta.prepare_tensors(self.token_lora_indices) diff --git a/vllm/lora/punica_wrapper/punica_tpu.py b/vllm/lora/punica_wrapper/punica_tpu.py index 090878dcd2546..0888772db54e7 100644 --- a/vllm/lora/punica_wrapper/punica_tpu.py +++ b/vllm/lora/punica_wrapper/punica_tpu.py @@ -292,7 +292,6 @@ class PunicaWrapperTPU(PunicaWrapperBase): lora_index_to_id: list[int | None], max_loras: int, vocab_size: int, - extra_vocab_size: int, ): # Make sure we don't accidentally collect outside operations torch_xla.sync() @@ -313,7 +312,7 @@ class PunicaWrapperTPU(PunicaWrapperBase): lora_index_to_id, max_loras, vocab_size, - extra_vocab_size, + 0, # extra_vocab_size "cpu", ) self._token_lora_indices = 
self._pad_to_shape( diff --git a/vllm/lora/punica_wrapper/punica_xpu.py b/vllm/lora/punica_wrapper/punica_xpu.py index b95087d0ff834..00c00782896cf 100644 --- a/vllm/lora/punica_wrapper/punica_xpu.py +++ b/vllm/lora/punica_wrapper/punica_xpu.py @@ -43,13 +43,10 @@ class PunicaWrapperXPU(PunicaWrapperBase): lora_index_to_id: list[int | None], max_loras: int, vocab_size: int, - extra_vocab_size: int, **kwargs, ): self.is_prefill = mapping.is_prefill - self._update_base_metadata( - mapping, lora_index_to_id, max_loras, vocab_size, extra_vocab_size - ) + self._update_base_metadata(mapping, lora_index_to_id, max_loras, vocab_size) def _get_token_lora_indices(self, x: torch.Tensor) -> torch.IntTensor: return torch.narrow(self._token_lora_indices, 0, 0, x.size(0)) diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 0f43ff06d8f2b..a49a7d9d1669d 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -166,6 +166,16 @@ def parse_fine_tuned_lora_name( raise ValueError(f"{name} is unsupported LoRA weight") +def is_base_embeddding_weights(name: str) -> bool: + # hardcoded subfixes for input & output embedding weights + input_embedding_subfix = ".embed_tokens.base_layer.weight" + output_embedding_subfix = ".lm_head.base_layer.weight" + + return name.endswith(input_embedding_subfix) or name.endswith( + output_embedding_subfix + ) + + def is_regex_target_modules( load_modules: str | list[str], expected_lora_modules: list[str] ) -> bool: diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index b85151f2c7592..4cc201a6414f1 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -121,8 +121,7 @@ class WorkerLoRAManager: lora_model_id=lora_request.lora_int_id, device="cpu", dtype=self.lora_config.lora_dtype, - target_embedding_padding=self.vocab_size - + self.lora_config.lora_extra_vocab_size, + target_embedding_padding=self.vocab_size, embedding_modules=self.embedding_modules, 
embedding_padding_modules=self.embedding_padding_modules, tensorizer_config_dict=lora_request.tensorizer_config_dict, @@ -143,12 +142,6 @@ class WorkerLoRAManager: # For BadRequestError raise e - if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size: - raise ValueError( - f"LoRA added vocab size {lora.extra_vocab_size} " - f"is greater than lora_extra_vocab_size " - f"{self.lora_config.lora_extra_vocab_size}." - ) return lora def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool: diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 1dc205b47753d..cd7ce2fc8f00a 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -46,7 +46,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding, ) @@ -261,29 +260,16 @@ class GraniteModel(nn.Module): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config self.config = config self.quant_config = quant_config - lora_vocab = ( - (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) - if lora_config - else 0 - ) - self.vocab_size = config.vocab_size + lora_vocab - self.org_vocab_size = config.vocab_size + if get_pp_group().is_first_rank or ( config.tie_word_embeddings and get_pp_group().is_last_rank ): self.embed_tokens = VocabParallelEmbedding( - self.vocab_size, + config.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config - else 
lora_config.lora_vocab_padding_size, quant_config=quant_config, ) else: @@ -420,28 +406,18 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config self.config = config - self.lora_config = lora_config + self.quant_config = quant_config self.model = GraniteModel( vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") ) if get_pp_group().is_last_rank: - self.unpadded_vocab_size = config.vocab_size - if lora_config: - self.unpadded_vocab_size += lora_config.lora_extra_vocab_size self.lm_head = ParallelLMHead( - self.unpadded_vocab_size, + config.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config - else lora_config.lora_vocab_padding_size, quant_config=quant_config, prefix=maybe_prefix(prefix, "lm_head"), ) @@ -453,7 +429,7 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP): logit_scale /= config.logits_scaling self.logits_processor = LogitsProcessor( - self.unpadded_vocab_size, config.vocab_size, scale=logit_scale + config.vocab_size, scale=logit_scale ) else: self.lm_head = PPMissingLayer() diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d5b49d2fb4c26..ebf8addda4a54 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -47,7 +47,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding, ) @@ -368,24 +367,18 @@ class LlamaModel(nn.Module): config = 
vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config self.config = config self.quant_config = quant_config - lora_vocab = ( - (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) - if lora_config - else 0 - ) - self.vocab_size = config.vocab_size + lora_vocab - self.org_vocab_size = config.vocab_size + + self.vocab_size = config.vocab_size + if get_pp_group().is_first_rank or ( config.tie_word_embeddings and get_pp_group().is_last_rank ): self.embed_tokens = VocabParallelEmbedding( self.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, quant_config=quant_config, ) else: @@ -562,9 +555,7 @@ class LlamaForCausalLM( super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config self.config = config - self.lora_config = lora_config self.model = self._init_model( vllm_config=vllm_config, @@ -573,20 +564,9 @@ class LlamaForCausalLM( ) if get_pp_group().is_last_rank: - self.unpadded_vocab_size = config.vocab_size - if lora_config: - self.unpadded_vocab_size += lora_config.lora_extra_vocab_size self.lm_head = ParallelLMHead( - self.unpadded_vocab_size, + config.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=( - DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if not lora_config - else lora_config.lora_vocab_padding_size - ), quant_config=quant_config, prefix=maybe_prefix(prefix, "lm_head"), ) @@ -595,7 +575,7 @@ class LlamaForCausalLM( logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor( - self.unpadded_vocab_size, config.vocab_size, logit_scale + config.vocab_size, scale=logit_scale ) else: self.lm_head = PPMissingLayer() diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 54ab8dd493e73..0a9c3f136964e 100644 --- 
a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -51,7 +51,6 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding, ) @@ -301,23 +300,18 @@ class MixtralModel(nn.Module): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config + parallel_config = vllm_config.parallel_config self.config = config self.quant_config = quant_config - lora_vocab = ( - (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) - if lora_config - else 0 - ) - self.vocab_size = config.vocab_size + lora_vocab + + self.vocab_size = config.vocab_size self.org_vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding( self.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, ) self.enable_eplb = parallel_config.enable_eplb @@ -508,34 +502,24 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP, MixtureOfExperts): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config + self.config = config - self.lora_config = lora_config + self.quant_config = quant_config self.model = MixtralModel( vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") ) - self.unpadded_vocab_size = config.vocab_size - if lora_config: - self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( - self.unpadded_vocab_size, + config.vocab_size, config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE - # We need bigger padding if using lora for kernel - # compatibility - if 
not lora_config - else lora_config.lora_vocab_padding_size, quant_config=quant_config, prefix=maybe_prefix(prefix, "lm_head"), ) if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight - self.logits_processor = LogitsProcessor( - self.unpadded_vocab_size, config.vocab_size - ) + self.logits_processor = LogitsProcessor(config.vocab_size) self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors ) diff --git a/vllm/model_executor/models/teleflm.py b/vllm/model_executor/models/teleflm.py index 8a0bec9dff848..bebd7bcaa9249 100644 --- a/vllm/model_executor/models/teleflm.py +++ b/vllm/model_executor/models/teleflm.py @@ -74,5 +74,5 @@ class TeleFLMForCausalLM(LlamaForCausalLM): self.output_mult = self.config.output_mult / self.mup_scale_factor logit_scale = self.output_mult self.logits_processor = LogitsProcessor( - self.unpadded_vocab_size, self.config.vocab_size, logit_scale + self.config.vocab_size, scale=logit_scale ) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index e9eb7cad38f88..923c31c187f31 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -219,9 +219,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.hidden_size = model_config.get_hidden_size() self.vocab_size = model_config.get_vocab_size() - if self.lora_config is not None: - self.vocab_size += self.lora_config.lora_extra_vocab_size - # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope From ed6ae1e36a03bed4a29287163e051a7772b1d8b1 Mon Sep 17 00:00:00 2001 From: Xiao Li Date: Thu, 20 Nov 2025 17:54:35 -0800 Subject: [PATCH 091/249] [AITER] [ROCm] Fix crash when loading llama4 model with old aiter version installed, fallback to forward_native implementation (#29124) Signed-off-by: Xiao Li --- vllm/v1/sample/ops/topk_topp_sampler.py | 19 +++++++++++++------ 1 file changed, 13 
insertions(+), 6 deletions(-) diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index c6c7e924175f7..5b2d130b0ea42 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -60,13 +60,20 @@ class TopKTopPSampler(nn.Module): logprobs_mode not in ("processed_logits", "processed_logprobs") and rocm_aiter_ops.is_enabled() ): - import aiter.ops.sampling # noqa: F401 + try: + import aiter.ops.sampling # noqa: F401 - self.aiter_ops = torch.ops.aiter - logger.info_once( - "Using aiter sampler on ROCm (lazy import, sampling-only)." - ) - self.forward = self.forward_hip + self.aiter_ops = torch.ops.aiter + logger.info_once( + "Using aiter sampler on ROCm (lazy import, sampling-only)." + ) + self.forward = self.forward_hip + except ImportError: + logger.warning_once( + "aiter.ops.sampling is not available on ROCm. " + "Falling back to forward_native implementation." + ) + self.forward = self.forward_native else: self.forward = self.forward_native From e1eefa4c40fc5b28bd7e83b6596bb5d2f420fd92 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 20 Nov 2025 20:54:59 -0500 Subject: [PATCH 092/249] [Bug] Fix torch warning of tf32 usage (#29112) Signed-off-by: yewentao256 --- vllm/model_executor/layers/batch_invariant.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 69fa6bdffd43f..bec7af0286345 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -852,5 +852,6 @@ def init_batch_invariance(): enable_batch_invariant_mode() # Disable TF32 for batch invariance - it causes non-deterministic rounding - torch.backends.cuda.matmul.allow_tf32 = False - torch.backends.cudnn.allow_tf32 = False + torch.backends.cuda.matmul.fp32_precision = "ieee" + 
torch.backends.cudnn.conv.fp32_precision = "ieee" + torch.backends.cudnn.rnn.fp32_precision = "ieee" From 3f5f36da3fefbae96960f60d41ccf8ac1155515e Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Thu, 20 Nov 2025 22:30:07 -0500 Subject: [PATCH 093/249] [ROCm] Fix for import when building with upstream triton for gfx1100 for gpt-oss serving (#29127) Signed-off-by: Hongxia Yang --- .../layers/quantization/utils/mxfp4_utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index cbc46810a26a6..d0c8b3d1a3093 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -39,15 +39,15 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps): value_layout = StridedLayout scale_layout = StridedLayout elif current_platform.is_rocm(): - from triton_kernels.tensor_details.layout import ( - GFX950MXScaleLayout, - StridedLayout, - ) - from vllm.platforms.rocm import on_gfx950 value_layout = StridedLayout - scale_layout = GFX950MXScaleLayout if on_gfx950() else StridedLayout + if on_gfx950(): + from triton_kernels.tensor_details.layout import GFX950MXScaleLayout + + scale_layout = GFX950MXScaleLayout + else: + scale_layout = StridedLayout else: value_layout, value_layout_opts = layout.make_default_matmul_mxfp4_w_layout( mx_axis=1 From 56669c1f293d5c53b6a19ddf2f78802fa9fff2c2 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 20 Nov 2025 22:36:07 -0500 Subject: [PATCH 094/249] [CI] Fix mypy for `vllm/v1/worker` (#29037) Signed-off-by: yewentao256 --- tools/pre_commit/mypy.py | 2 +- vllm/model_executor/utils.py | 2 +- vllm/multimodal/utils.py | 4 +- vllm/v1/worker/cpu_worker.py | 12 +- vllm/v1/worker/gpu_model_runner.py | 128 +++++++++++------- 
vllm/v1/worker/gpu_ubatch_wrapper.py | 16 ++- vllm/v1/worker/gpu_worker.py | 62 +++++---- .../worker/kv_connector_model_runner_mixin.py | 2 +- vllm/v1/worker/tpu_model_runner.py | 28 +++- vllm/v1/worker/tpu_worker.py | 5 +- vllm/v1/worker/utils.py | 8 +- vllm/v1/worker/worker_base.py | 2 + vllm/v1/worker/xpu_worker.py | 9 +- 13 files changed, 178 insertions(+), 102 deletions(-) diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py index 8d04848f8f780..34f6e8c928ffb 100755 --- a/tools/pre_commit/mypy.py +++ b/tools/pre_commit/mypy.py @@ -38,6 +38,7 @@ FILES = [ "vllm/usage", "vllm/v1/core", "vllm/v1/engine", + "vllm/v1/worker", ] # After fixing errors resulting from changing follow_imports @@ -62,7 +63,6 @@ SEPARATE_GROUPS = [ "vllm/v1/sample", "vllm/v1/spec_decode", "vllm/v1/structured_output", - "vllm/v1/worker", ] # TODO(woosuk): Include the code from Megatron and HuggingFace. diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 759b809433b14..8aad59e84ff25 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -10,7 +10,7 @@ import torch from vllm.utils.torch_utils import is_torch_equal_or_newer -def set_random_seed(seed: int) -> None: +def set_random_seed(seed: int | None) -> None: from vllm.platforms import current_platform current_platform.seed_everything(seed) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 3f55c46ca334d..ac89bdacc01d5 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -3,7 +3,7 @@ import asyncio import atexit -from collections.abc import Iterable, Set +from collections.abc import Generator, Set from concurrent.futures import ThreadPoolExecutor from itertools import groupby from pathlib import Path @@ -403,7 +403,7 @@ def group_mm_kwargs_by_modality( pin_memory: bool = False, merge_by_field_config: bool | None = None, multimodal_cpu_fields: Set[str] = frozenset(), -) -> Iterable[tuple[str, int, BatchedTensorInputs]]: +) -> 
Generator[tuple[str, int, BatchedTensorInputs], None, None]: """Group consecutive `MultiModalKwargsItem`s from `mm_kwargs` with the same modality together into the same `MultiModalKwargs` instance. diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index 4420a057d1e58..b080fea1d2dd6 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -3,6 +3,7 @@ import os import platform from collections.abc import Callable +from typing import Any import torch @@ -37,6 +38,9 @@ class CPUWorker(Worker): self.parallel_config.disable_custom_all_reduce = True + # Torch profiler. Enabled and configured through env vars: + # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + self.profiler: Any | None = None if envs.VLLM_TORCH_PROFILER_DIR: torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" @@ -80,13 +84,13 @@ class CPUWorker(Worker): self.local_omp_cpuid = "nobind" else: local_dp_rank = self.parallel_config.data_parallel_rank_local - omp_cpuids = omp_cpuids.split("|") + omp_cpuids_list = omp_cpuids.split("|") if local_dp_rank is not None: world_size = self.parallel_config.world_size - omp_cpuids = omp_cpuids[ + omp_cpuids_list = omp_cpuids_list[ local_dp_rank * world_size : (local_dp_rank + 1) * world_size ] - self.local_omp_cpuid = omp_cpuids[self.rank] + self.local_omp_cpuid = omp_cpuids_list[self.rank] if self.local_omp_cpuid != "nobind": ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) @@ -120,7 +124,7 @@ class CPUWorker(Worker): pass def determine_available_memory(self) -> int: - return self.cache_config.cpu_kvcache_space_bytes # type: ignore + return self.cache_config.cpu_kvcache_space_bytes or 0 def compile_or_warm_up_model(self) -> None: # Reset the seed to ensure that the random state is not affected by diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4b0a08ab57e16..a7fa68b20ac50 100644 --- 
a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5,7 +5,7 @@ import gc import itertools import time from collections import defaultdict -from collections.abc import Iterator +from collections.abc import Iterator, Sequence from contextlib import contextmanager from copy import copy, deepcopy from functools import reduce @@ -53,6 +53,7 @@ from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader from vllm.model_executor.models.interfaces import ( + SupportsMRoPE, SupportsMultiModal, is_mixture_of_experts, supports_eagle3, @@ -126,6 +127,7 @@ from vllm.v1.outputs import ( ) from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.logits_processor import LogitsProcessors, build_logitsprocs +from vllm.v1.sample.logits_processor.interface import LogitsProcessor from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import RejectionSampler from vllm.v1.sample.sampler import Sampler @@ -404,7 +406,10 @@ class GPUModelRunner( # solution, we initialize the input batch here, and re-initialize it # in `initialize_kv_cache` if the block_sizes here is different from # the block_sizes in the kv cache config. - custom_logitsprocs = model_config.logits_processors + logits_processors = model_config.logits_processors + custom_logitsprocs: Sequence[str | type[LogitsProcessor]] = ( + tuple(logits_processors) if logits_processors is not None else () + ) self.input_batch = InputBatch( max_num_reqs=self.max_num_reqs, # We need to use the encoder length for encoder-decoer @@ -959,9 +964,13 @@ class GPUModelRunner( def _init_mrope_positions(self, req_state: CachedRequestState): model = self.get_model() assert supports_mrope(model), "M-RoPE support is not implemented." 
+ assert req_state.prompt_token_ids is not None, ( + "M-RoPE requires prompt_token_ids to be available." + ) + mrope_model = cast(SupportsMRoPE, model) req_state.mrope_positions, req_state.mrope_position_delta = ( - model.get_mrope_input_positions( + mrope_model.get_mrope_input_positions( req_state.prompt_token_ids, req_state.mm_features, ) @@ -1762,6 +1771,7 @@ class GPUModelRunner( dst_start = mrope_pos_ptr dst_end = mrope_pos_ptr + completion_part_len + assert req.mrope_position_delta is not None MRotaryEmbedding.get_next_input_positions_tensor( out=self.mrope_positions.np, out_offset=dst_start, @@ -1907,6 +1917,8 @@ class GPUModelRunner( for mm_input_id in encoder_input_ids: mm_feature = req_state.mm_features[mm_input_id] + if mm_feature.data is None: + continue mm_hash = mm_feature.identifier mm_kwargs.append(mm_feature.data) mm_hashes_pos.append((mm_hash, mm_feature.mm_position)) @@ -1930,7 +1942,7 @@ class GPUModelRunner( # multimodal inputs. The proper solution should be reordering the # encoder outputs. model = cast(SupportsMultiModal, self.model) - encoder_outputs = [] + encoder_outputs: list[torch.Tensor] = [] for modality, num_items, mm_kwargs_group in group_mm_kwargs_by_modality( mm_kwargs, device=self.device, @@ -1938,7 +1950,7 @@ class GPUModelRunner( merge_by_field_config=model.merge_by_field_config, multimodal_cpu_fields=model.multimodal_cpu_fields, ): - curr_group_outputs = [] + curr_group_outputs: list[torch.Tensor] = [] # EVS-related change. # (ekhvedchenia): Temporary hack to limit peak memory usage when @@ -1980,7 +1992,7 @@ class GPUModelRunner( # 2. A list or tuple (length: num_items) of tensors, # each of shape (feature_size, hidden_size) in case the feature # size is dynamic depending on the input multimodal items. 
- curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) + curr_group_outputs = model.embed_multimodal(**mm_kwargs_group) # type: ignore[assignment] sanity_check_mm_encoder_outputs( curr_group_outputs, @@ -2180,7 +2192,7 @@ class GPUModelRunner( def sync_and_slice_intermediate_tensors( self, num_tokens: int, - intermediate_tensors: IntermediateTensors, + intermediate_tensors: IntermediateTensors | None, sync_self: bool, ) -> IntermediateTensors: assert self.intermediate_tensors is not None @@ -2397,6 +2409,7 @@ class GPUModelRunner( if is_first_rank: intermediate_tensors = None else: + assert intermediate_tensors is not None intermediate_tensors = self.sync_and_slice_intermediate_tensors( num_input_tokens, intermediate_tensors, True ) @@ -2765,14 +2778,14 @@ class GPUModelRunner( uniform_decode = ( max_num_scheduled_tokens == self.uniform_decode_query_len ) and (num_scheduled_tokens == num_reqs * max_num_scheduled_tokens) - batch_descriptor = BatchDescriptor( + batch_desc = BatchDescriptor( num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=len(self.input_batch.lora_id_to_lora_request) > 0, ) cudagraph_runtime_mode, batch_descriptor = ( self.cudagraph_dispatcher.dispatch( - batch_descriptor, + batch_desc, use_cascade_attn=cascade_attn_prefix_lens is not None, ) ) @@ -2856,15 +2869,15 @@ class GPUModelRunner( else: logits = self.model.compute_logits(sample_hidden_states) - model_output_broadcast_data = {} + model_output_broadcast_data: dict[str, Any] = {} if logits is not None: model_output_broadcast_data["logits"] = logits.contiguous() - model_output_broadcast_data = get_pp_group().broadcast_tensor_dict( + broadcasted = get_pp_group().broadcast_tensor_dict( model_output_broadcast_data, src=len(get_pp_group().ranks) - 1 ) - assert model_output_broadcast_data is not None - logits = model_output_broadcast_data["logits"] + assert broadcasted is not None + logits = broadcasted["logits"] self.execute_model_state = ExecuteModelState( 
scheduler_output, @@ -2889,7 +2902,7 @@ class GPUModelRunner( if self.execute_model_state is None: # Nothing to do (PP non-final rank case), output isn't used. if not kv_connector_output: - return None # noqa + return None # type: ignore[return-value] # In case of PP with kv transfer, we need to pass through the # kv_connector_output @@ -2941,33 +2954,37 @@ class GPUModelRunner( spec_decode_common_attn_metadata, ) + spec_config = self.speculative_config use_padded_batch_for_eagle = ( - self.speculative_config - and self.speculative_config.use_eagle() - and not self.speculative_config.disable_padded_drafter_batch + spec_config is not None + and spec_config.use_eagle() + and not spec_config.disable_padded_drafter_batch ) effective_drafter_max_model_len = self.max_model_len if effective_drafter_max_model_len is None: effective_drafter_max_model_len = self.model_config.max_model_len if ( - self.speculative_config - and self.speculative_config.draft_model_config is not None - and self.speculative_config.draft_model_config.max_model_len is not None + spec_config is not None + and spec_config.draft_model_config is not None + and spec_config.draft_model_config.max_model_len is not None ): effective_drafter_max_model_len = ( - self.speculative_config.draft_model_config.max_model_len + spec_config.draft_model_config.max_model_len ) input_fits_in_drafter = spec_decode_common_attn_metadata and ( spec_decode_common_attn_metadata.max_seq_len + self.num_spec_tokens <= effective_drafter_max_model_len ) if use_padded_batch_for_eagle: + assert self.speculative_config is not None + assert isinstance(self.drafter, EagleProposer) sampled_token_ids = sampler_output.sampled_token_ids if input_fits_in_drafter: # EAGLE speculative decoding can use the GPU sampled tokens # as inputs, and does not need to wait for bookkeeping to finish. 
propose_draft_token_ids(sampled_token_ids) elif self.valid_sampled_token_count_event is not None: + assert spec_decode_common_attn_metadata is not None next_token_ids, valid_sampled_tokens_count = ( self.drafter.prepare_next_token_ids_padded( spec_decode_common_attn_metadata, @@ -3105,7 +3122,9 @@ class GPUModelRunner( common_attn_metadata: CommonAttentionMetadata, ) -> torch.Tensor | list[list[int]]: num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens - if self.speculative_config.method == "ngram": + spec_config = self.speculative_config + assert spec_config is not None + if spec_config.method == "ngram": assert isinstance(sampled_token_ids, list) assert isinstance(self.drafter, NgramProposer) draft_token_ids = self.drafter.propose( @@ -3115,11 +3134,11 @@ class GPUModelRunner( self.input_batch.token_ids_cpu, self.input_batch.spec_decode_unsupported_reqs, ) - elif self.speculative_config.method == "suffix": + elif spec_config.method == "suffix": assert isinstance(sampled_token_ids, list) assert isinstance(self.drafter, SuffixDecodingProposer) draft_token_ids = self.drafter.propose(self.input_batch, sampled_token_ids) - elif self.speculative_config.method == "medusa": + elif spec_config.method == "medusa": assert isinstance(sampled_token_ids, list) assert isinstance(self.drafter, MedusaProposer) @@ -3144,10 +3163,10 @@ class GPUModelRunner( target_hidden_states=hidden_states, sampling_metadata=sampling_metadata, ) - elif self.speculative_config.use_eagle(): + elif spec_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) - if self.speculative_config.disable_padded_drafter_batch: + if spec_config.disable_padded_drafter_batch: # When padded-batch is disabled, the sampled_token_ids should be # the cpu-side list[list[int]] of valid sampled tokens for each # request, with invalid requests having empty lists. 
@@ -3197,7 +3216,7 @@ class GPUModelRunner( else: target_hidden_states = hidden_states[:num_scheduled_tokens] else: - if self.speculative_config.disable_padded_drafter_batch: + if spec_config.disable_padded_drafter_batch: token_indices_to_sample = None common_attn_metadata, token_indices = self.drafter.prepare_inputs( common_attn_metadata, @@ -3292,9 +3311,12 @@ class GPUModelRunner( and is_mixture_of_experts(self.drafter.model) and self.parallel_config.enable_eplb ): + spec_config = self.vllm_config.speculative_config + assert spec_config is not None + assert spec_config.draft_model_config is not None logger.info_once( "EPLB is enabled for drafter model %s.", - self.vllm_config.speculative_config.draft_model_config.model, + spec_config.draft_model_config.model, ) global_expert_load = ( @@ -3311,7 +3333,7 @@ class GPUModelRunner( self.eplb_state = EplbState(self.parallel_config, self.device) self.eplb_state.add_model( self.drafter.model, - self.vllm_config.speculative_config.draft_model_config, + spec_config.draft_model_config, global_expert_load, old_global_expert_indices, rank_mapping, @@ -3346,9 +3368,11 @@ class GPUModelRunner( scope="local", ) prepare_communication_buffer_for_model(self.model) + mm_config = self.model_config.multimodal_config self.is_multimodal_pruning_enabled = ( supports_multimodal_pruning(self.get_model()) - and self.model_config.multimodal_config.is_multimodal_pruning_enabled() + and mm_config is not None + and mm_config.is_multimodal_pruning_enabled() ) if is_mixture_of_experts(self.model) and self.parallel_config.enable_eplb: @@ -3383,15 +3407,14 @@ class GPUModelRunner( # CudagraphWraper and CudagraphDispatcher of vllm. # wrap the model with full cudagraph wrapper if needed. 
- if ( - self.compilation_config.cudagraph_mode.has_full_cudagraphs() - and not self.parallel_config.enable_dbo - ): + cudagraph_mode = self.compilation_config.cudagraph_mode + assert cudagraph_mode is not None + if cudagraph_mode.has_full_cudagraphs() and not self.parallel_config.enable_dbo: self.model = CUDAGraphWrapper( self.model, self.vllm_config, runtime_mode=CUDAGraphMode.FULL ) elif self.parallel_config.enable_dbo: - if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): + if cudagraph_mode.has_full_cudagraphs(): self.model = UBatchWrapper( self.model, self.vllm_config, CUDAGraphMode.FULL, self.device ) @@ -4071,7 +4094,8 @@ class GPUModelRunner( def profile_run(self) -> None: # Profile with multimodal encoder & encoder cache. if self.supports_mm_inputs: - if self.model_config.multimodal_config.skip_mm_profiling: + mm_config = self.model_config.multimodal_config + if mm_config is not None and mm_config.skip_mm_profiling: logger.info( "Skipping memory profiling for multimodal encoder and " "encoder cache." 
@@ -4333,8 +4357,9 @@ class GPUModelRunner( def get_attn_backends_for_group( kv_cache_group_spec: KVCacheGroupSpec, ) -> tuple[dict[AttentionGroupKey, list[str]], set[type[AttentionBackend]]]: + layer_type = cast(type[Any], AttentionLayerBase) layers = get_layers_from_vllm_config( - self.vllm_config, AttentionLayerBase, kv_cache_group_spec.layer_names + self.vllm_config, layer_type, kv_cache_group_spec.layer_names ) attn_backends = {} attn_backend_layers = defaultdict(list) @@ -4349,7 +4374,7 @@ class GPUModelRunner( if layer_name in self.kv_sharing_fast_prefill_eligible_layers: attn_backend = create_fast_prefill_custom_backend( "FastPrefill", - attn_backend, + attn_backend, # type: ignore[arg-type] ) full_cls_name = attn_backend.full_cls_name() @@ -4448,6 +4473,7 @@ class GPUModelRunner( min_cg_backend_name = attn_backend.__name__ # Flexible resolve the cudagraph mode cudagraph_mode = self.compilation_config.cudagraph_mode + assert cudagraph_mode is not None # check cudagraph for mixed batch is supported if ( cudagraph_mode.mixed_mode() == CUDAGraphMode.FULL @@ -4562,12 +4588,17 @@ class GPUModelRunner( self.compilation_config.adjust_cudagraph_sizes_for_spec_decode( self.uniform_decode_query_len, self.parallel_config.tensor_parallel_size ) - self.cudagraph_batch_sizes = self.compilation_config.cudagraph_capture_sizes + capture_sizes = self.compilation_config.cudagraph_capture_sizes + self.cudagraph_batch_sizes = ( + capture_sizes if capture_sizes is not None else [] + ) # Trigger cudagraph dispatching keys initialization after # resolved cudagraph mode. 
+ cudagraph_mode = self.compilation_config.cudagraph_mode + assert cudagraph_mode is not None self.cudagraph_dispatcher.initialize_cudagraph_keys( - self.compilation_config.cudagraph_mode, self.uniform_decode_query_len + cudagraph_mode, self.uniform_decode_query_len ) def calculate_reorder_batch_threshold(self) -> None: @@ -4579,7 +4610,7 @@ class GPUModelRunner( """ min_none_high = lambda a, b: a if b is None else b if a is None else min(a, b) - reorder_batch_thresholds = [ + reorder_batch_thresholds: list[int | None] = [ group.get_metadata_builder().reorder_batch_threshold for group in self._attn_group_iterator() ] @@ -4588,7 +4619,7 @@ class GPUModelRunner( if len(reorder_batch_thresholds) == 0: self.reorder_batch_threshold = None return - self.reorder_batch_threshold = reduce(min_none_high, reorder_batch_thresholds) + self.reorder_batch_threshold = reduce(min_none_high, reorder_batch_thresholds) # type: ignore[assignment] @staticmethod def select_common_block_size( @@ -5048,12 +5079,16 @@ class GPUModelRunner( kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks) if self.dcp_world_size > 1: - layers = get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase) + layer_type = cast(type[Any], AttentionLayerBase) + layers = get_layers_from_vllm_config(self.vllm_config, layer_type) for layer in layers.values(): - assert layer.impl.need_to_return_lse_for_decode, ( + layer_impl = getattr(layer, "impl", None) + if layer_impl is None: + continue + assert layer_impl.need_to_return_lse_for_decode, ( "DCP requires attention impls to return" " the softmax lse for decode, but the impl " - f"{layer.impl.__class__.__name__} " + f"{layer_impl.__class__.__name__} " "does not return the softmax lse for decode." 
) @@ -5094,7 +5129,8 @@ class GPUModelRunner( if has_ec_transfer() and get_ec_transfer().is_producer: return {} kv_cache_spec: dict[str, KVCacheSpec] = {} - attn_layers = get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase) + layer_type = cast(type[Any], AttentionLayerBase) + attn_layers = get_layers_from_vllm_config(self.vllm_config, layer_type) for layer_name, attn_module in attn_layers.items(): if isinstance(attn_module, Attention) and ( kv_tgt_layer := attn_module.kv_sharing_target_layer_name diff --git a/vllm/v1/worker/gpu_ubatch_wrapper.py b/vllm/v1/worker/gpu_ubatch_wrapper.py index 9de123263755b..2ce2b64512560 100644 --- a/vllm/v1/worker/gpu_ubatch_wrapper.py +++ b/vllm/v1/worker/gpu_ubatch_wrapper.py @@ -121,18 +121,24 @@ class UBatchWrapper: @staticmethod def _create_sm_control_context(vllm_config: VllmConfig): - comm_sms = envs.VLLM_DBO_COMM_SMS + comm_sms: int = envs.VLLM_DBO_COMM_SMS set_comm_sms = lambda sms: None if vllm_config.parallel_config.enable_expert_parallel: # Currently only DeepEP highthroughput supports SM control so this # only affects that case. 
- all2all_manager = get_ep_group().device_communicator.all2all_manager + ep_group = get_ep_group() + device_communicator = ep_group.device_communicator + all2all_manager = None + if device_communicator is not None: + all2all_manager = device_communicator.all2all_manager - if all2all_manager.max_sms_used() is not None: - comm_sms = min(comm_sms, all2all_manager.max_sms_used()) + if all2all_manager is not None: + max_sms_used = all2all_manager.max_sms_used() + if max_sms_used is not None: + comm_sms = min(comm_sms, max_sms_used) - if comm_sms > 0: + if comm_sms > 0 and all2all_manager is not None: set_comm_sms = lambda sms: all2all_manager.set_num_sms(sms) # TODO(lucas): support other kernels besides DeepGEMM diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 18cbc38262793..f1fd5be966c37 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -6,7 +6,7 @@ import gc import os from contextlib import AbstractContextManager, nullcontext from types import NoneType -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, cast import torch import torch.distributed @@ -87,8 +87,10 @@ class Worker(WorkerBase): # Buffers saved before sleep self._sleep_saved_buffers: dict[str, torch.Tensor] = {} - # Torch profiler. Enabled and configured through env vars: + # Torch/CUDA profiler. Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + # VLLM_TORCH_CUDA_PROFILE=1 + self.profiler: Any | None = None if envs.VLLM_TORCH_PROFILER_DIR: worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" self.profiler = TorchProfilerWrapper( @@ -146,17 +148,17 @@ class Worker(WorkerBase): assert allocator.get_current_usage() == 0, ( "Sleep mode can only be used for one instance per process." 
) - context = allocator.use_memory_pool(tag=tag) + return allocator.use_memory_pool(tag=tag) else: - context = nullcontext() - return context + return nullcontext() def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks def init_device(self): - if self.device_config.device.type == "cuda": + device = self.device_config.device + if isinstance(device, torch.device) and device.type == "cuda": # This env var set by Ray causes exceptions with graph building. os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) if ( @@ -375,23 +377,21 @@ class Worker(WorkerBase): from vllm.device_allocator.cumem import CuMemAllocator allocator = CuMemAllocator.get_instance() - context = allocator.use_memory_pool(tag="kv_cache") + with allocator.use_memory_pool(tag="kv_cache"): + self.model_runner.initialize_kv_cache(kv_cache_config) else: - context = nullcontext() - with context: self.model_runner.initialize_kv_cache(kv_cache_config) def compile_or_warm_up_model(self) -> None: # warm up sizes that are not in cudagraph capture sizes, # but users still want to compile for better performance, # e.g. for the max-num-batched token size in chunked prefill. 
- warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy() + compile_sizes = self.vllm_config.compilation_config.compile_sizes + warmup_sizes = compile_sizes.copy() if compile_sizes is not None else [] if not self.model_config.enforce_eager: - warmup_sizes = [ - x - for x in warmup_sizes - if x not in self.vllm_config.compilation_config.cudagraph_capture_sizes - ] + capture_sizes = self.vllm_config.compilation_config.cudagraph_capture_sizes + if capture_sizes is not None: + warmup_sizes = [x for x in warmup_sizes if x not in capture_sizes] # We skip EPLB here since we don't want to record dummy metrics for size in sorted(warmup_sizes, reverse=True): logger.info("Compile and warming up model for size %d", size) @@ -532,12 +532,12 @@ class Worker(WorkerBase): ) } if forward_pass and not get_pp_group().is_first_rank: - intermediate_tensors = IntermediateTensors( - get_pp_group().recv_tensor_dict( - all_gather_group=get_tp_group(), - all_gather_tensors=all_gather_tensors, - ) + tensor_dict = get_pp_group().recv_tensor_dict( + all_gather_group=get_tp_group(), + all_gather_tensors=all_gather_tensors, ) + assert tensor_dict is not None + intermediate_tensors = IntermediateTensors(tensor_dict) with self.annotate_profile(scheduler_output): output = self.model_runner.execute_model( @@ -605,7 +605,7 @@ class Worker(WorkerBase): assert self.model_runner.eplb_state is not None self.model_runner.eplb_state.rearrange( execute_shuffle=True, - global_expert_load=None, + global_expert_loads=None, rank_mapping=rank_mapping, ) torch.cuda.synchronize() @@ -661,7 +661,7 @@ class Worker(WorkerBase): def _reconfigure_moe( self, old_ep_size: int, new_ep_size: int - ) -> torch.Tensor | None: + ) -> list[torch.Tensor] | None: """ Reconfigure MoE modules with provided reconfig_request @@ -728,26 +728,29 @@ class Worker(WorkerBase): num_local_physical_experts = num_local_experts assert self.model_runner.eplb_state is not None new_physical_experts = ( - 
self.model_runner.eplb_state.physical_to_logical_map.shape[1] + self.model_runner.eplb_state.physical_to_logical_map.shape[1] # type: ignore[attr-defined] ) parallel_config.eplb_config.num_redundant_experts = ( new_physical_experts - - self.model_runner.eplb_state.logical_replica_count.shape[1] + - self.model_runner.eplb_state.logical_replica_count.shape[1] # type: ignore[attr-defined] ) global_expert_loads = None else: - num_local_physical_experts = torch.tensor( + num_local_physical_experts_tensor = torch.tensor( [num_local_experts], dtype=torch.int32, device="cpu" ) torch.distributed.broadcast( - num_local_physical_experts, group=get_ep_group().cpu_group, group_src=0 + num_local_physical_experts_tensor, + group=get_ep_group().cpu_group, + group_src=0, ) - num_local_physical_experts = num_local_physical_experts.item() + num_local_physical_experts = int(num_local_physical_experts_tensor.item()) new_physical_experts = num_local_physical_experts * new_ep_size assert self.model_runner.eplb_state is not None - global_expert_loads = self.model_runner.eplb_state.rearrange( + global_expert_loads_any = self.model_runner.eplb_state.rearrange( execute_shuffle=False ) + global_expert_loads = cast(list[torch.Tensor], global_expert_loads_any) parallel_config.eplb_config.num_redundant_experts = ( new_physical_experts - global_expert_loads[0].shape[1] ) @@ -849,8 +852,9 @@ def init_worker_distributed_environment( init_batch_invariance() set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) + init_method = distributed_init_method or "env://" init_distributed_environment( - parallel_config.world_size, rank, distributed_init_method, local_rank, backend + parallel_config.world_size, rank, init_method, local_rank, backend ) ensure_model_parallel_initialized( diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py index e59361f21372a..ff047d8d03f0e 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py 
+++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -59,7 +59,7 @@ class KVConnectorModelRunnerMixin: @staticmethod def ensure_kv_transfer_shutdown() -> None: # has_kv_transfer_group can be None during interpreter shutdown. - if has_kv_transfer_group and has_kv_transfer_group(): + if has_kv_transfer_group and has_kv_transfer_group(): # type: ignore[truthy-function] ensure_kv_transfer_shutdown() @staticmethod diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 923c31c187f31..450160d28649f 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -572,7 +572,10 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): format. Layers that do not need KV cache are not included. """ - layers = get_layers_from_vllm_config(self.vllm_config, AttentionLayerBase) + layers = get_layers_from_vllm_config( + self.vllm_config, + AttentionLayerBase, # type: ignore[type-abstract] + ) block_size = self.vllm_config.cache_config.block_size cache_dtype_str = self.vllm_config.cache_config.cache_dtype @@ -725,7 +728,11 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): req_id = self.input_batch.req_ids[i] assert req_id is not None num_tokens = scheduler_output.num_scheduled_tokens[req_id] - if not use_max_model_len and num_tokens > self.most_model_len: + if ( + not use_max_model_len + and self.most_model_len is not None + and num_tokens > self.most_model_len + ): use_max_model_len = True num_scheduled_tokens_per_req.append(num_tokens) if use_max_model_len: @@ -737,6 +744,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): else: end_index = num_reqs else: + assert self.num_reqs_most_model_len is not None if len(num_scheduled_tokens_per_req) > self.num_reqs_most_model_len: num_scheduled_tokens_per_req = num_scheduled_tokens_per_req[ : self.num_reqs_most_model_len @@ -829,6 +837,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, 
KVConnectorModelRunnerMixin): ].to(self.device) seq_lens = self.seq_lens_cpu[: self.num_reqs_max_model_len].to(self.device) else: + assert self.num_reqs_most_model_len is not None block_tables = self.block_table_cpu[ : self.num_reqs_most_model_len, : self.num_blocks_per_most_len_req ] @@ -931,6 +940,8 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): for mm_input_id in encoder_input_ids: mm_feature = req_state.mm_features[mm_input_id] + if mm_feature.data is None: + continue mm_hash = mm_feature.identifier mm_kwargs.append(mm_feature.data) mm_hashes_pos.append((mm_hash, mm_feature.mm_position)) @@ -1114,7 +1125,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) -> ModelRunnerOutput: if self.scheduler_output is None: # Nothing to do (PP non-final rank case), output isn't used. - return None # noqa + return None # type: ignore[return-value] scheduler_output = self.scheduler_output mm_embed_inputs = self.mm_embed_inputs self.scheduler_output = None @@ -1696,7 +1707,8 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ) -> None: # Profile with multimodal encoder & encoder cache. if self.supports_mm_inputs: - if self.model_config.multimodal_config.skip_mm_profiling: + mm_config = self.model_config.multimodal_config + if mm_config is not None and mm_config.skip_mm_profiling: logger.info( "Skipping memory profiling for multimodal encoder and " "encoder cache." 
@@ -2166,5 +2178,9 @@ def replace_set_lora(model): if isinstance(module, BaseLayerWithLoRA): module._original_set_lora = module.set_lora module._original_reset_lora = module.reset_lora - module.set_lora = _tpu_set_lora.__get__(module, module.__class__) - module.reset_lora = _tpu_reset_lora.__get__(module, module.__class__) + module.set_lora = _tpu_set_lora.__get__( # type: ignore[method-assign] + module, module.__class__ + ) + module.reset_lora = _tpu_reset_lora.__get__( # type: ignore[method-assign] + module, module.__class__ + ) diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index a716a9c3aa822..569b2aaa766e4 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -141,8 +141,7 @@ class TPUWorker: # Set random seed. set_random_seed(self.model_config.seed) - if self.model_config.seed is not None: - xm.set_rng_state(self.model_config.seed, self.device) + xm.set_rng_state(self.model_config.seed, self.device) # Increase the cache size limit, which is the maximum number of # dynamo graphs that can be compiled. 
@@ -332,7 +331,7 @@ class TPUWorker: world_size=parallel_config.world_size, rank=rank, local_rank=local_rank, - distributed_init_method=distributed_init_method, + distributed_init_method=distributed_init_method or "env://", backend=current_platform.dist_backend, ) ensure_model_parallel_initialized( diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 9e99ea964ee08..92e4ce3abdba3 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -280,7 +280,7 @@ def bind_kv_cache( kv_caches: dict[str, torch.Tensor], forward_context: dict[str, "Attention"], runner_kv_caches: list[torch.Tensor], - num_attn_module: int | None = 1, + num_attn_module: int = 1, ) -> None: """ Bind the allocated KV cache to both ModelRunner and forward context so @@ -362,5 +362,7 @@ def is_residual_scattered_for_sp( or vllm_config.compilation_config.use_inductor_graph_partition ): return True - - return num_input_tokens in vllm_config.compilation_config.compile_sizes + compile_sizes = vllm_config.compilation_config.compile_sizes + if compile_sizes is None: + return False + return num_input_tokens in compile_sizes diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py index 16f321c080779..57e7037e946ec 100644 --- a/vllm/v1/worker/worker_base.py +++ b/vllm/v1/worker/worker_base.py @@ -315,10 +315,12 @@ class WorkerWrapperBase: def initialize_from_config(self, kv_cache_configs: list[Any]) -> None: kv_cache_config = kv_cache_configs[self.global_rank] + assert self.vllm_config is not None with set_current_vllm_config(self.vllm_config): self.worker.initialize_from_config(kv_cache_config) # type: ignore def init_device(self): + assert self.vllm_config is not None with set_current_vllm_config(self.vllm_config): # To make vLLM config available during device initialization self.worker.init_device() # type: ignore diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index 26c6f8d06bdcd..4d7864e90496a 100644 --- a/vllm/v1/worker/xpu_worker.py 
+++ b/vllm/v1/worker/xpu_worker.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os +from typing import Any import torch import torch.distributed @@ -37,6 +38,7 @@ class XPUWorker(Worker): # Torch profiler. Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + self.profiler: Any | None = None if envs.VLLM_TORCH_PROFILER_DIR: torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" @@ -148,7 +150,12 @@ class XPUWorker(Worker): return int(available_kv_cache_memory) def init_device(self): - if self.device_config.device.type == "xpu" and current_platform.is_xpu(): + device = self.device_config.device + if ( + isinstance(device, torch.device) + and device.type == "xpu" + and current_platform.is_xpu() + ): self.device = torch.device(f"xpu:{self.local_rank}") current_platform.set_device(self.device) current_platform.check_if_supports_dtype(self.model_config.dtype) From 0e741c12e3dc45093b2ddab8a31310703aa27002 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 21 Nov 2025 11:38:35 +0800 Subject: [PATCH 095/249] [Bugfix] Fix Plamo3 rope handling (#29092) Signed-off-by: DarkLight1337 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/plamo3.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/plamo3.py b/vllm/model_executor/models/plamo3.py index 5bb07722a5fc1..4aeb9d432dcc6 100644 --- a/vllm/model_executor/models/plamo3.py +++ b/vllm/model_executor/models/plamo3.py @@ -62,7 +62,7 @@ class Plamo3Config(PretrainedConfig): # type: ignore # if `sliding_window` is list interleaved_sliding_window: list[int | None] sliding_window_pattern: int - rope_theta: int + rope_parameters: dict[str, Any] 
rope_local_theta: int # MLP intermediate_size: int @@ -153,13 +153,24 @@ class Plamo3AttentionMixer(nn.Module): quant_config=quant_config, prefix=f"{prefix}.o_proj", ) - layer_idx = extract_layer_index(prefix) - full_attn = config.interleaved_sliding_window[layer_idx] is None - self.rope_theta = config.rope_theta if full_attn else config.rope_local_theta - self.rope_scaling = ( - config.rope_scaling if hasattr(config, "rope_scaling") else None - ) + layer_idx = extract_layer_index(prefix) + layer_type = config.layer_types[layer_idx] + is_sliding = layer_type == "sliding_attention" + + # Initialize the rotary embedding. + if layer_type in config.rope_parameters: + # Transformers v5 rope config. + rope_parameters = config.rope_parameters[layer_type] + else: + # Transformers v4 rope config. + # Global attention. Use the values in config.json. + rope_parameters = config.rope_parameters + # Local attention. Override the values in config.json. + if is_sliding: + rope_parameters = dict( + rope_type="default", rope_theta=config.rope_local_theta + ) max_position = config.max_position_embeddings if hasattr(vllm_config.model_config, "max_model_len") and isinstance( vllm_config.model_config.max_model_len, int @@ -170,8 +181,7 @@ class Plamo3AttentionMixer(nn.Module): self.head_dim, rotary_dim=self.head_dim, max_position=max_position, - base=self.rope_theta, - rope_scaling=self.rope_scaling, + rope_parameters=rope_parameters, ) self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) set_weight_attrs( From a982f5b5ea4a1932424927ea357b532d0e45caf1 Mon Sep 17 00:00:00 2001 From: zhrrr <43847754+izhuhaoran@users.noreply.github.com> Date: Fri, 21 Nov 2025 11:39:09 +0800 Subject: [PATCH 096/249] [kernel][perf] support uncontiguous input for rms_norm kernel (#28103) Signed-off-by: zhuhaoran Signed-off-by: izhuhaoran Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- csrc/dispatch_utils.h | 21 ++++++++ csrc/layernorm_kernels.cu | 80 
+++++++++++++++++++++---------- vllm/_custom_ops.py | 5 +- vllm/compilation/matcher_utils.py | 4 +- 4 files changed, 77 insertions(+), 33 deletions(-) diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h index 9ae0ed975edde..e1d131e4a7851 100644 --- a/csrc/dispatch_utils.h +++ b/csrc/dispatch_utils.h @@ -117,3 +117,24 @@ break; \ } \ } + +#define VLLM_DISPATCH_RANK234(NUM_DIMS, ...) \ + switch (NUM_DIMS) { \ + case 2: { \ + constexpr int tensor_rank = 2; \ + __VA_ARGS__(); \ + break; \ + } \ + case 3: { \ + constexpr int tensor_rank = 3; \ + __VA_ARGS__(); \ + break; \ + } \ + case 4: { \ + constexpr int tensor_rank = 4; \ + __VA_ARGS__(); \ + break; \ + } \ + default: \ + TORCH_CHECK(false, "Expects rank 2, 3 or 4 tensors but got ", NUM_DIMS); \ + } diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu index 48771e4b3aff9..dfc67b933ccae 100644 --- a/csrc/layernorm_kernels.cu +++ b/csrc/layernorm_kernels.cu @@ -10,16 +10,38 @@ namespace vllm { // TODO(woosuk): Further optimize this kernel. 
-template +template __global__ void rms_norm_kernel( - scalar_t* __restrict__ out, // [..., hidden_size] - const scalar_t* __restrict__ input, // [..., hidden_size] - const int64_t input_stride, + scalar_t* __restrict__ out, // [..., hidden_size] + const scalar_t* __restrict__ input, // [..., hidden_size] + const int64_t input_stride_d2, // input.stride(-2) + const int64_t input_stride_d3, // input.stride(-3) + const int64_t input_stride_d4, // input.stride(-4) + const int64_t input_shape_d2, // input.size(-2) + const int64_t input_shape_d3, // input.size(-3) const scalar_t* __restrict__ weight, // [hidden_size] const float epsilon, const int num_tokens, const int hidden_size) { __shared__ float s_variance; float variance = 0.0f; - const scalar_t* input_row = input + blockIdx.x * input_stride; + const scalar_t* input_row; + if constexpr (NUM_DIMS == 2) { + // 2D for layernorm normal case [batch_size, hidden] + input_row = input + blockIdx.x * input_stride_d2; + } else if constexpr (NUM_DIMS == 3) { + // 3D for q/k norm [batch_size, num_heads, head_size] + int batch_idx = blockIdx.x / input_shape_d2; + int head_idx = blockIdx.x % input_shape_d2; + input_row = + input + batch_idx * input_stride_d3 + head_idx * input_stride_d2; + } else if constexpr (NUM_DIMS == 4) { + // 4D for transformers model_impl qk norm [batch, seq, head, head_dim] + int batch_idx = blockIdx.x / (input_shape_d3 * input_shape_d2); + int remaining = blockIdx.x % (input_shape_d3 * input_shape_d2); + int seq_idx = remaining / input_shape_d2; + int head_idx = remaining % input_shape_d2; + input_row = input + batch_idx * input_stride_d4 + + seq_idx * input_stride_d3 + head_idx * input_stride_d2; + } auto vec_op = [&variance](const vec_n_t& vec) { #pragma unroll @@ -164,38 +186,44 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size] torch::Tensor& weight, // [hidden_size] double epsilon) { TORCH_CHECK(out.is_contiguous()); + if (input.stride(-1) != 1) { + input = input.contiguous(); + } 
TORCH_CHECK(input.stride(-1) == 1); TORCH_CHECK(weight.is_contiguous()); int hidden_size = input.size(-1); - // We cannot just use `input.stride(-2)` if the tensor is not row-major. - // Instead, we use a 2d view to get the second-innermost stride. - // That way the dimensions (except the last one) can be arbitrarily permuted. - torch::Tensor input_view = input.view({-1, hidden_size}); - - int num_tokens = input_view.numel() / hidden_size; - int64_t input_stride = input_view.stride(-2); + int num_tokens = input.numel() / hidden_size; + int num_dims = input.dim(); + int64_t input_stride_d2 = input.stride(-2); + int64_t input_stride_d3 = (num_dims >= 3) ? input.stride(-3) : 0; + int64_t input_stride_d4 = (num_dims >= 4) ? input.stride(-4) : 0; + int64_t input_shape_d2 = (num_dims >= 3) ? input.size(-2) : 0; + int64_t input_shape_d3 = (num_dims >= 4) ? input.size(-3) : 0; // For large num_tokens, use smaller blocks to increase SM concurrency. const int max_block_size = (num_tokens < 256) ? 
1024 : 256; dim3 grid(num_tokens); - const at::cuda::OptionalCUDAGuard device_guard(device_of(input_view)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( - input_view.scalar_type(), "rms_norm_kernel", [&] { - const int calculated_vec_size = - std::gcd(16 / sizeof(scalar_t), hidden_size); - const int block_size = - std::min(hidden_size / calculated_vec_size, max_block_size); - dim3 block(block_size); - VLLM_DISPATCH_VEC_SIZE(calculated_vec_size, [&] { - vllm::rms_norm_kernel<<>>( - out.data_ptr(), input_view.data_ptr(), - input_stride, weight.data_ptr(), epsilon, num_tokens, - hidden_size); - }); + VLLM_DISPATCH_RANK234(num_dims, [&] { + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] { + const int calculated_vec_size = + std::gcd(16 / sizeof(scalar_t), hidden_size); + const int block_size = + std::min(hidden_size / calculated_vec_size, max_block_size); + dim3 block(block_size); + VLLM_DISPATCH_VEC_SIZE(calculated_vec_size, [&] { + vllm::rms_norm_kernel + <<>>( + out.data_ptr(), input.data_ptr(), + input_stride_d2, input_stride_d3, input_stride_d4, + input_shape_d2, input_shape_d3, weight.data_ptr(), + epsilon, num_tokens, hidden_size); }); + }); + }); } #define LAUNCH_FUSED_ADD_RMS_NORM(width) \ diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 66cf6472eee40..0f625a7945241 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -328,10 +328,7 @@ def rotary_embedding( def rms_norm( out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, epsilon: float ) -> None: - # TODO: Remove this contiguous call when the kernel is updated to support non-contiguous input - # If removed, also need to remove contiguous in MatcherRMSNorm - input_contiguous = input.contiguous() - torch.ops._C.rms_norm(out, input_contiguous, weight, epsilon) + torch.ops._C.rms_norm(out, input, weight, epsilon) def fused_add_rms_norm( diff 
--git a/vllm/compilation/matcher_utils.py b/vllm/compilation/matcher_utils.py index 38eb4e5301a18..e4cd063d2aee1 100644 --- a/vllm/compilation/matcher_utils.py +++ b/vllm/compilation/matcher_utils.py @@ -162,12 +162,10 @@ class MatcherRMSNorm(MatcherCustomOp): weight: torch.Tensor, ) -> torch.Tensor: result = torch.empty_like(input) - # TODO: support non-contiguous input for RMSNorm and remove this - input_contiguous = input.contiguous() _, result = auto_functionalized( RMS_OP, result=result, - input=input_contiguous, + input=input, weight=weight, epsilon=self.epsilon, ) From 0730414999343e722590ace615d5814c7e5b6827 Mon Sep 17 00:00:00 2001 From: jeremyteboul <80506730+jeremyteboul@users.noreply.github.com> Date: Thu, 20 Nov 2025 19:39:47 -0800 Subject: [PATCH 097/249] [Core] Add audio_embeds support to chat completions (#29059) Signed-off-by: Jeremy Teboul Co-authored-by: Jeremy Teboul --- docs/features/multimodal_inputs.md | 32 ++++++ tests/entrypoints/test_chat_utils.py | 145 ++++++++++++++++++++++++++ vllm/entrypoints/chat_utils.py | 149 ++++++++++++++++++++++++++- vllm/multimodal/audio.py | 24 +++++ vllm/multimodal/utils.py | 13 ++- 5 files changed, 360 insertions(+), 3 deletions(-) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 5f684604e6031..4656ee43ea251 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -365,6 +365,8 @@ You must enable this feature via `enable_mm_embeds=True`. The vLLM engine may crash if incorrect shape of embeddings is passed. Only enable this flag for trusted users! +#### Image Embeddings + ??? code ```python @@ -441,6 +443,36 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd print(generated_text) ``` +#### Audio Embeddings + +You can pass pre-computed audio embeddings similar to image embeddings: + +??? 
code + + ```python + from vllm import LLM + import torch + + # Enable audio embeddings support + llm = LLM(model="fixie-ai/ultravox-v0_5-llama-3_2-1b", enable_mm_embeds=True) + + # Refer to the HuggingFace repo for the correct format to use + prompt = "USER: