From 0ff70821c9b0b991197fa7f3264bf9dd78b8d4b3 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Sun, 23 Nov 2025 20:18:55 -0800
Subject: [PATCH 001/134] [Core] Deprecate `xformers` (#29262)
Signed-off-by: Roger Wang
---
docker/Dockerfile.nightly_torch | 35 +-
.../contributing/ci/update_pytorch_version.md | 15 -
docs/getting_started/quickstart.md | 2 +-
.../openai_embedding_long_text/service.sh | 1 -
requirements/cuda.txt | 1 -
.../test_basic_correctness.py | 3 -
tests/kernels/attention/test_attention.py | 129 ------
.../attention/test_attention_selector.py | 8 +-
tests/kernels/attention/test_mha_attn.py | 4 -
tests/kernels/utils.py | 78 +---
tests/lora/test_minicpmv_tp.py | 12 -
tests/lora/test_qwen2vl.py | 15 -
vllm/attention/backends/registry.py | 1 -
vllm/attention/layer.py | 38 --
vllm/attention/ops/vit_attn_wrappers.py | 38 +-
vllm/attention/selector.py | 9 +-
vllm/config/multimodal.py | 6 +
vllm/envs.py | 1 -
vllm/model_executor/models/dots_ocr.py | 27 +-
vllm/model_executor/models/ernie45_vl.py | 33 +-
vllm/model_executor/models/glm4_1v.py | 31 +-
vllm/model_executor/models/keye.py | 30 +-
vllm/model_executor/models/paddleocr_vl.py | 13 -
vllm/model_executor/models/pixtral.py | 1 +
vllm/model_executor/models/qwen2_5_vl.py | 25 +-
vllm/model_executor/models/qwen2_vl.py | 31 +-
.../models/qwen3_omni_moe_thinker.py | 12 +-
vllm/model_executor/models/qwen3_vl.py | 13 +-
vllm/platforms/cuda.py | 7 +-
vllm/utils/__init__.py | 1 -
vllm/v1/attention/backends/xformers.py | 420 ------------------
31 files changed, 77 insertions(+), 963 deletions(-)
delete mode 100644 vllm/v1/attention/backends/xformers.py
diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch
index b88b9c4992200..d663c82c3885e 100644
--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -76,34 +76,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/common.txt
-# must put before installing xformers, so it can install the correct version of xfomrers.
-ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
-ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
-
-# Build xformers with cuda and torch nightly
-# following official xformers guidance: https://github.com/facebookresearch/xformers#build
-# todo(elainewy): cache xformers build result for faster build
-ARG max_jobs=16
-ENV MAX_JOBS=${max_jobs}
-ARG XFORMERS_COMMIT=f2de641ef670510cadab099ce6954031f52f191c
-
-ENV CCACHE_DIR=/root/.cache/ccache
-RUN --mount=type=cache,target=/root/.cache/ccache \
- --mount=type=cache,target=/root/.cache/uv \
- echo 'git clone xformers...' \
- && git clone https://github.com/facebookresearch/xformers.git --recursive \
- && cd xformers \
- && git checkout ${XFORMERS_COMMIT} \
- && git submodule update --init --recursive \
- && echo 'finish git clone xformers...' \
- && rm -rf build \
- && python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \
- && cd .. \
- && rm -rf xformers
-
-RUN --mount=type=cache,target=/root/.cache/uv \
- uv pip install --system xformers-dist/*.whl --verbose
-
# build can take a long time, and the torch nightly version fetched from url can be different in next docker stage.
# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
@@ -233,11 +205,6 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system vllm-dist/*.whl --verbose
-# install xformers again for the new environment
-RUN --mount=type=bind,from=base,src=/workspace/xformers-dist,target=/vllm-workspace/xformers-dist \
- --mount=type=cache,target=/root/.cache/uv \
- uv pip install --system /vllm-workspace/xformers-dist/*.whl --verbose
-
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
# install package for build flashinfer
@@ -307,7 +274,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/nightly_torch_test.txt
# Logging to confirm the torch versions
-RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
+RUN pip freeze | grep -E 'torch|vllm|flashinfer'
# Logging to confirm all the packages are installed
RUN pip freeze
diff --git a/docs/contributing/ci/update_pytorch_version.md b/docs/contributing/ci/update_pytorch_version.md
index 09fd85a466eed..735bb2e205332 100644
--- a/docs/contributing/ci/update_pytorch_version.md
+++ b/docs/contributing/ci/update_pytorch_version.md
@@ -98,21 +98,6 @@ to warm it up so that future builds are faster.
-## Update dependencies
-
-Several vLLM dependencies like xFormers depend on PyTorch and need
-to be updated accordingly. Rather than waiting for all of them to publish new
-releases (which would take too much time), they can be built from
-source to unblock the update process.
-
-### xFormers
-
-```bash
-export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a'
-MAX_JOBS=16 uv pip install --system \
- --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.32.post2"
-```
-
## Update all the different vLLM platforms
Rather than attempting to update all vLLM platforms in a single pull request, it's more manageable
diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md
index 9e86f785b10c7..94920dc5306b3 100644
--- a/docs/getting_started/quickstart.md
+++ b/docs/getting_started/quickstart.md
@@ -283,7 +283,7 @@ Currently, vLLM supports multiple backends for efficient Attention computation a
If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options:
-- On NVIDIA CUDA: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`.
+- On NVIDIA CUDA: `FLASH_ATTN` or `FLASHINFER`.
- On AMD ROCm: `TRITON_ATTN`, `ROCM_ATTN`, `ROCM_AITER_FA` or `ROCM_AITER_UNIFIED_ATTN`.
For AMD ROCm, you can further control the specific Attention implementation using the following variables:
diff --git a/examples/online_serving/openai_embedding_long_text/service.sh b/examples/online_serving/openai_embedding_long_text/service.sh
index 1577de85f7ff2..b5c92749466b0 100644
--- a/examples/online_serving/openai_embedding_long_text/service.sh
+++ b/examples/online_serving/openai_embedding_long_text/service.sh
@@ -22,7 +22,6 @@ API_KEY=${API_KEY:-"your-api-key"}
POOLING_TYPE=${POOLING_TYPE:-"auto"} # auto, MEAN, CLS, LAST
export VLLM_ENABLE_CHUNKED_PROCESSING=true
export CUDA_VISIBLE_DEVICES=2,3,4,5
-# export VLLM_ATTENTION_BACKEND=XFORMERS
echo "🚀 Starting vLLM Embedding Server with Enhanced Chunked Processing"
echo "=================================================================="
diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index d63fe9e1e77c1..15e8aadc56f47 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -9,6 +9,5 @@ torch==2.9.0
torchaudio==2.9.0
# These must be updated alongside torch
torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-xformers==0.0.33.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.9
# FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.5.2
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 0cf1e85d4e8ee..521d6c33dd390 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -74,9 +74,6 @@ def test_models(
model_executor: str,
enable_prompt_embeds: bool,
) -> None:
- if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
- pytest.skip(f"{backend} does not support gemma2 with full context length.")
-
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", backend)
diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py
index 9662e73321ebe..1a7d5ce0ddc1e 100644
--- a/tests/kernels/attention/test_attention.py
+++ b/tests/kernels/attention/test_attention.py
@@ -13,12 +13,6 @@ from vllm.attention.layer import Attention, MultiHeadAttention
from vllm.platforms import current_platform
from vllm.utils.mem_utils import get_max_shared_memory_bytes
-if not current_platform.is_rocm():
- from xformers import ops as xops
- from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
-
- from tests.kernels.utils import make_alibi_bias
-
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# This will change depending on the compute capability.
# - 512 as a buffer
@@ -448,129 +442,6 @@ def ref_multi_query_kv_attention(
return torch.cat(ref_outputs, dim=0)
-@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS)
-@pytest.mark.parametrize("num_heads", NUM_HEADS)
-@pytest.mark.parametrize("head_size", HEAD_SIZES)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.skipif(
- current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm."
-)
-@torch.inference_mode()
-def test_multi_query_kv_attention(
- num_seqs: int,
- num_heads: tuple[int, int],
- head_size: int,
- dtype: torch.dtype,
- seed: int,
- device: str,
- use_alibi: bool = False,
-) -> None:
- current_platform.seed_everything(seed)
- torch.set_default_device(device)
- # MAX_SEQ_LEN sometimes causes OOM in the reference implementation.
- # As the xformers library is already tested with its own tests, we can use
- # a smaller MAX_SEQ_LEN here.
- max_len = min(MAX_SEQ_LEN, 4096)
- seq_lens = random.sample(range(1, max_len), num_seqs)
- num_tokens = sum(seq_lens)
-
- scale = float(1.0 / (head_size**0.5))
- num_query_heads, num_kv_heads = num_heads
- qkv = torch.empty(
- num_tokens, num_query_heads + 2 * num_kv_heads, head_size, dtype=dtype
- )
- qkv.uniform_(-scale, scale)
- query, key, value = qkv.split([num_query_heads, num_kv_heads, num_kv_heads], dim=1)
-
- num_queries_per_kv = num_query_heads // num_kv_heads
- if num_queries_per_kv > 1:
- # Handle MQA and GQA
- key = torch.repeat_interleave(key, num_queries_per_kv, dim=1)
- value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)
- alibi_bias = None
- if use_alibi:
- alibi_slopes = torch.randn(num_query_heads, dtype=torch.float)
- attn_bias = make_alibi_bias(alibi_slopes, num_kv_heads, dtype, seq_lens)
- output = torch.empty_like(query)
- start = 0
- # Dynamic sequence length not supported with custom attn_bias.
- for i, seq_len in enumerate(seq_lens):
- end = start + seq_len
- out = xops.memory_efficient_attention_forward(
- query[None, start:end],
- key[None, start:end],
- value[None, start:end],
- attn_bias=attn_bias[i],
- p=0.0,
- scale=scale,
- )
- output[start:end].copy_(out.view_as(query[start:end]))
- start += seq_len
- # xformers.AttentionBias to Tensor for use in reference impl.
- alibi_bias = [
- b.materialize((1, num_query_heads, i, i), device=device).squeeze()
- for b, i in zip(attn_bias, seq_lens)
- ]
- else:
- attn_bias = BlockDiagonalCausalMask.from_seqlens(seq_lens)
- output = xops.memory_efficient_attention_forward(
- query.unsqueeze(0),
- key.unsqueeze(0),
- value.unsqueeze(0),
- attn_bias=attn_bias,
- p=0.0,
- scale=scale,
- )
- output = output.squeeze(0)
-
- cu_seq_lens = [0]
- for seq_len in seq_lens:
- cu_seq_lens.append(cu_seq_lens[-1] + seq_len)
- ref_output = ref_multi_query_kv_attention(
- cu_seq_lens,
- query,
- key,
- value,
- scale,
- alibi_bias,
- dtype,
- )
- atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3
- rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5
- torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol)
-
-
-@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS)
-@pytest.mark.parametrize("num_heads", NUM_HEADS)
-@pytest.mark.parametrize("head_size", [64])
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.skipif(
- current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm."
-)
-@torch.inference_mode()
-def test_multi_query_kv_attention_with_alibi(
- num_seqs: int,
- num_heads: tuple[int, int],
- head_size: int,
- dtype: torch.dtype,
- seed: int,
- device: str,
-) -> None:
- return test_multi_query_kv_attention(
- num_seqs,
- num_heads,
- head_size,
- dtype,
- seed,
- device,
- use_alibi=True,
- )
-
-
@pytest.mark.parametrize("attention_cls", [Attention, MultiHeadAttention])
def test_num_heads_not_divisble_by_num_kv_heads(attention_cls: type) -> None:
head_size = 64
diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py
index 9be56a33f76c8..cd34b520ea71b 100644
--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@@ -34,7 +34,7 @@ DEVICE_MLA_BACKENDS = {
}
DEVICE_REGULAR_ATTN_BACKENDS = {
- "cuda": ["XFORMERS", "FLASHINFER", "FLASH_ATTN"],
+ "cuda": ["FLASHINFER", "FLASH_ATTN"],
"hip": ["ROCM_ATTN"],
"cpu": ["CPU_ATTN"],
}
@@ -207,12 +207,6 @@ def test_env(
)
expected = "FLASHINFER"
assert backend.get_name() == expected
- elif name == "XFORMERS":
- backend = get_attn_backend(
- 32, torch.float16, None, block_size, use_mla=use_mla
- )
- expected = "XFORMERS"
- assert backend.get_name() == expected
elif name == "FLASH_ATTN":
backend = get_attn_backend(
32, torch.float16, None, block_size, use_mla=use_mla
diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py
index a878ac6396ce5..ae3c63cc62d6b 100644
--- a/tests/kernels/attention/test_mha_attn.py
+++ b/tests/kernels/attention/test_mha_attn.py
@@ -24,10 +24,6 @@ from vllm.platforms.rocm import RocmPlatform
def clear_cache():
"""Clear lru cache to ensure each test case runs without caching."""
_cached_get_attn_backend.cache_clear()
- # Clear xformers availability cache
- import vllm.attention.layer as layer_module
-
- layer_module.USE_XFORMERS_OPS = None
@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index 5d5a26fbfc2cd..9307ef7814a8b 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -509,43 +509,6 @@ def pack_qkv(qkv: QKVInputs, device: torch.device | str) -> PackedQKVInputs:
)
-def make_alibi_bias(
- alibi_slopes: torch.Tensor,
- num_kv_heads: int,
- dtype: torch.dtype,
- seq_lens: list[int],
-) -> list[Any]:
- """Create ALiBi biases compatible with xFormers attention tests."""
- from xformers.ops.fmha.attn_bias import LowerTriangularMaskWithTensorBias
-
- if alibi_slopes is None:
- return [None for _ in seq_lens]
-
- attn_biases: list[Any] = []
- num_heads = alibi_slopes.shape[0]
- assert num_heads >= num_kv_heads, (
- "ALiBi slopes expect at least as many heads as KV heads"
- )
-
- for seq_len in seq_lens:
- bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device)
- bias = bias[None, :] - bias[:, None]
-
- padded_len = (seq_len + 7) // 8 * 8
- bias_tensor = torch.empty(
- 1,
- num_heads,
- seq_len,
- padded_len,
- device=alibi_slopes.device,
- dtype=dtype,
- )[:, :, :, :seq_len].copy_(bias)
- bias_tensor.mul_(alibi_slopes[:, None, None])
- attn_biases.append(LowerTriangularMaskWithTensorBias(bias_tensor))
-
- return attn_biases
-
-
def _make_metadata_tensors(
seq_lens: list[int] | None,
context_lens: list[int] | None,
@@ -649,23 +612,12 @@ def make_kv_cache(
Returns:
- * kv_cache: 2 x num_blocks x (block_size * num_heads * head_size)
- * for backend 'XFORMERS'
* kv_cache: 2 x num_blocks x block_size x num_heads x head_size
* for backend 'FLASH_ATTN'
"""
- if backend == "XFORMERS":
- kv_cache = torch.rand((2, num_blocks, block_size * num_heads * head_size)).to(
- device
- )
- elif backend == "FLASH_ATTN":
- kv_cache = torch.rand((2, num_blocks, block_size, num_heads, head_size)).to(
- device
- )
- else:
- raise ValueError(
- f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or 'FLASH_ATTN'."
- )
+ if backend != "FLASH_ATTN":
+ raise ValueError(f"Unknown backend value: '{backend}'. Expected 'FLASH_ATTN'.")
+ kv_cache = torch.rand((2, num_blocks, block_size, num_heads, head_size)).to(device)
if default_val is not None:
kv_cache[:, :, :] = default_val
return kv_cache
@@ -843,22 +795,14 @@ def assert_actual_matches_ideal(
* output_under_test: actually observed output value
"""
ideal_output = test_params.packed_qkvo.ideal_output
- if backend == "XFORMERS":
- torch.testing.assert_close(
- ideal_output, output_under_test.view_as(ideal_output)
- )
-
- elif backend == "FLASH_ATTN":
- # For FlashAttention override the accuracy thresholds to non default
- # values since we notice a higher difference between the ideal and
- # actual output.
- torch.testing.assert_close(
- ideal_output, output_under_test.view_as(ideal_output), atol=0.01, rtol=0.016
- )
- else:
- raise ValueError(
- f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or 'FLASH_ATTN'."
- )
+ if backend != "FLASH_ATTN":
+ raise ValueError(f"Unknown backend value: '{backend}'. Expected 'FLASH_ATTN'.")
+ # For FlashAttention override the accuracy thresholds to non default
+ # values since we notice a higher difference between the ideal and
+ # actual output.
+ torch.testing.assert_close(
+ ideal_output, output_under_test.view_as(ideal_output), atol=0.01, rtol=0.016
+ )
# Copied/modified from torch._refs.__init__.py
diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py
index 1cf8ed602b6a4..e430826461a14 100644
--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
@@ -57,10 +57,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return generated_texts
-@pytest.mark.xfail(
- current_platform.is_rocm(),
- reason="MiniCPM-V dependency xformers incompatible with ROCm",
-)
def test_minicpmv_lora(minicpmv_lora_files):
llm = vllm.LLM(
MODEL_PATH,
@@ -84,10 +80,6 @@ def test_minicpmv_lora(minicpmv_lora_files):
@pytest.mark.skipif(
current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
)
-@pytest.mark.xfail(
- current_platform.is_rocm(),
- reason="MiniCPM-V dependency xformers incompatible with ROCm",
-)
@multi_gpu_test(num_gpus=4)
def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
llm = vllm.LLM(
@@ -108,10 +100,6 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
@pytest.mark.skipif(
current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
)
-@pytest.mark.xfail(
- current_platform.is_rocm(),
- reason="MiniCPM-V dependency xformers incompatible with ROCm",
-)
@multi_gpu_test(num_gpus=4)
def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
llm = vllm.LLM(
diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py
index 1800ca107a426..7d8c940100ca4 100644
--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
@@ -2,12 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
-import pytest
-
import vllm
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest
-from vllm.platforms import current_platform
from vllm.sampling_params import BeamSearchParams
@@ -142,10 +139,6 @@ QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"
-@pytest.mark.xfail(
- current_platform.is_rocm(),
- reason="Qwen2-VL dependency xformers incompatible with ROCm",
-)
def test_qwen2vl_lora(qwen2vl_lora_files):
"""Test Qwen 2.0 VL model with LoRA"""
config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
@@ -156,10 +149,6 @@ def test_qwen2vl_lora(qwen2vl_lora_files):
tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)
-@pytest.mark.xfail(
- current_platform.is_rocm(),
- reason="Qwen2-VL dependency xformers incompatible with ROCm",
-)
def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
"""Test Qwen 2.0 VL model with LoRA through beam search."""
config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
@@ -178,10 +167,6 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
)
-@pytest.mark.xfail(
- current_platform.is_rocm(),
- reason="Qwen2.5-VL dependency xformers incompatible with ROCm",
-)
def test_qwen25vl_lora(qwen25vl_lora_files):
"""Test Qwen 2.5 VL model with LoRA"""
config = TestConfig(model_path=QWEN25VL_MODEL_PATH, lora_path=qwen25vl_lora_files)
diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py
index 6747cf7743b14..125e4e3827747 100644
--- a/vllm/attention/backends/registry.py
+++ b/vllm/attention/backends/registry.py
@@ -43,7 +43,6 @@ class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta):
FLASH_ATTN = "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
TRITON_ATTN = "vllm.v1.attention.backends.triton_attn.TritonAttentionBackend"
- XFORMERS = "vllm.v1.attention.backends.xformers.XFormersAttentionBackend"
ROCM_ATTN = "vllm.v1.attention.backends.rocm_attn.RocmAttentionBackend"
ROCM_AITER_MLA = "vllm.v1.attention.backends.mla.rocm_aiter_mla.AiterMLABackend"
ROCM_AITER_TRITON_MLA = (
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index a8e796a1eab63..f1d57ac50fb9f 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -51,31 +51,6 @@ else:
FP8_DTYPE = current_platform.fp8_dtype()
logger = init_logger(__name__)
-USE_XFORMERS_OPS = None
-
-
-def check_xformers_availability():
- global USE_XFORMERS_OPS
- if USE_XFORMERS_OPS is not None:
- return USE_XFORMERS_OPS
-
- if current_platform.is_cuda() and current_platform.has_device_capability(100):
- # Xformers FA is not compatible with B200
- USE_XFORMERS_OPS = False
- else:
- try:
- from importlib.util import find_spec
-
- find_spec("xformers.ops")
- USE_XFORMERS_OPS = True
- except ImportError:
- USE_XFORMERS_OPS = False
-
- # the warning only needs to be shown once
- if not USE_XFORMERS_OPS:
- logger.warning("Xformers is not available, falling back.")
-
- return USE_XFORMERS_OPS
def check_upstream_fa_availability(dtype: torch.dtype):
@@ -533,7 +508,6 @@ class MultiHeadAttention(nn.Module):
if backend
in {
AttentionBackendEnum.TORCH_SDPA,
- AttentionBackendEnum.XFORMERS,
AttentionBackendEnum.PALLAS,
AttentionBackendEnum.ROCM_AITER_FA,
AttentionBackendEnum.FLASH_ATTN,
@@ -549,12 +523,6 @@ class MultiHeadAttention(nn.Module):
)
)
- if (
- self.attn_backend == AttentionBackendEnum.XFORMERS
- and not check_xformers_availability()
- ):
- self.attn_backend = AttentionBackendEnum.TORCH_SDPA
-
self.is_flash_attn_backend = self.attn_backend in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.ROCM_AITER_FA,
@@ -614,12 +582,6 @@ class MultiHeadAttention(nn.Module):
max_seqlen_k=kv_len,
softmax_scale=self.scale,
)
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- from xformers import ops as xops
-
- out = xops.memory_efficient_attention_forward(
- query, key, value, scale=self.scale
- )
elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
query, key, value = (x.transpose(1, 2) for x in (query, key, value))
out = F.scaled_dot_product_attention(query, key, value, scale=self.scale)
diff --git a/vllm/attention/ops/vit_attn_wrappers.py b/vllm/attention/ops/vit_attn_wrappers.py
index 06a9f7cd82266..46f8f5117f7a7 100644
--- a/vllm/attention/ops/vit_attn_wrappers.py
+++ b/vllm/attention/ops/vit_attn_wrappers.py
@@ -3,7 +3,7 @@
"""
This file contains ops for ViT attention to be compatible with torch.compile
as there are operations here not supported by torch.compile (for instance,
-`to_list` in xformers attn, or `.item()` in flash attention)
+`.item()` in flash attention)
Using these ops and wrapping vision blocks with `torch.compile` can speed up
throughput in vision models by ~5% relative on H100, and improve token
@@ -19,42 +19,6 @@ import torch.nn.functional as F
from vllm.utils.torch_utils import direct_register_custom_op
-def xformers_attn_seqlens_wrapper(
- q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seqlens: torch.Tensor
-) -> torch.Tensor:
- from xformers import ops as xops
- from xformers.ops.fmha.attn_bias import BlockDiagonalMask
-
- attn_bias = BlockDiagonalMask.from_seqlens(
- q_seqlen=seqlens.tolist(), kv_seqlen=None, device=q.device
- )
- context_layer = xops.memory_efficient_attention_forward(
- q, k, v, attn_bias=attn_bias, p=0, scale=None
- )
- context_layer = einops.rearrange(context_layer, "b s h d -> s b (h d)").contiguous()
- return context_layer
-
-
-def xformers_attn_seqlens_wrapper_fake(
- q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seqlens: torch.Tensor
-) -> torch.Tensor:
- b, s, h, d = q.shape
- return torch.empty((s, b, h * d), dtype=q.dtype, device=q.device)
-
-
-direct_register_custom_op(
- op_name="xformers_attn_seqlens_wrapper",
- op_func=xformers_attn_seqlens_wrapper,
- fake_impl=xformers_attn_seqlens_wrapper_fake,
-)
-
-
-def vit_xformers_attn_wrapper(
- q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, seqlens: torch.Tensor
-) -> torch.Tensor:
- return torch.ops.vllm.xformers_attn_seqlens_wrapper(q, k, v, seqlens)
-
-
def flash_attn_maxseqlen_wrapper(
q: torch.Tensor,
k: torch.Tensor,
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index e9af08b2316d2..ad19b58aa155c 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -36,7 +36,14 @@ def get_env_variable_attn_backend() -> AttentionBackendEnum | None:
* None otherwise
"""
backend_name = os.environ.get(STR_BACKEND_ENV_VAR)
- return None if backend_name is None else AttentionBackendEnum[backend_name]
+ if backend_name is None:
+ return None
+ if backend_name == "XFORMERS":
+ raise ValueError(
+ "Attention backend 'XFORMERS' has been removed (See PR #29262 for "
+ "details). Please select a supported attention backend."
+ )
+ return AttentionBackendEnum[backend_name]
# Global state allows a particular choice of backend
diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py
index 9f62b35ed515c..00a81a319bf72 100644
--- a/vllm/config/multimodal.py
+++ b/vllm/config/multimodal.py
@@ -173,6 +173,12 @@ class MultiModalConfig:
# We need to import the real type here (deferred to avoid circular import).
from vllm.attention.backends.registry import AttentionBackendEnum
+ if isinstance(value, str) and value.upper() == "XFORMERS":
+ raise ValueError(
+ "Attention backend 'XFORMERS' has been removed (See PR #29262 for "
+ "details). Please select a supported attention backend."
+ )
+
if value is None or isinstance(value, AttentionBackendEnum):
return value
diff --git a/vllm/envs.py b/vllm/envs.py
index 9b1ed1fc680b4..56558548d3981 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -640,7 +640,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Example options:
# - "TORCH_SDPA": use torch.nn.MultiheadAttention
# - "FLASH_ATTN": use FlashAttention
- # - "XFORMERS": use XFormers
# - "FLASHINFER": use flashinfer
# - "FLASHMLA": use FlashMLA
# - "FLASH_ATTN_MLA": use FlashAttention for MLA
diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py
index 2d2251e83b5b1..5460018d0d67a 100644
--- a/vllm/model_executor/models/dots_ocr.py
+++ b/vllm/model_executor/models/dots_ocr.py
@@ -306,7 +306,6 @@ class DotsVisionAttention(nn.Module):
if self.attn_backend not in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.TORCH_SDPA,
- AttentionBackendEnum.XFORMERS,
AttentionBackendEnum.ROCM_AITER_FA,
}:
raise RuntimeError(
@@ -324,7 +323,6 @@ class DotsVisionAttention(nn.Module):
rotary_pos_emb: torch.Tensor | None = None,
*,
max_seqlen: int | None = None,
- seqlens: list[int] | None = None,
) -> torch.Tensor:
# [S, C] -> [S, B=1, C]
x = hidden_states.unsqueeze(1)
@@ -374,16 +372,6 @@ class DotsVisionAttention(nn.Module):
out_i = out_i.permute(0, 2, 1, 3)
outputs.append(out_i)
context_layer = torch.cat(outputs, dim=1) if outputs else q[:, :0]
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- from xformers import ops as xops
- from xformers.ops.fmha.attn_bias import BlockDiagonalMask
-
- attn_bias = BlockDiagonalMask.from_seqlens(
- q_seqlen=seqlens, kv_seqlen=None, device=q.device
- )
- context_layer = xops.memory_efficient_attention_forward(
- q, k, v, attn_bias=attn_bias, p=0, scale=None
- )
else:
raise RuntimeError("Unsupported attention backend")
@@ -545,14 +533,12 @@ class DotsVisionBlock(nn.Module):
cu_seqlens: torch.Tensor,
rotary_pos_emb: torch.Tensor,
max_seqlen: int | None = None,
- seqlens: list[int] | None = None,
) -> torch.Tensor:
hidden_states = hidden_states + self.attn(
self.norm1(hidden_states),
cu_seqlens=cu_seqlens,
rotary_pos_emb=rotary_pos_emb,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
return hidden_states
@@ -663,18 +649,14 @@ class DotsVisionTransformer(nn.Module):
rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
return rotary_pos_emb
- def compute_attn_mask_seqlen(
- self, cu_seqlens: torch.Tensor
- ) -> tuple[int | None, list[int] | None]:
- max_seqlen, seqlens = None, None
+ def compute_attn_mask_seqlen(self, cu_seqlens: torch.Tensor) -> int | None:
+ max_seqlen = None
if (
self.attn_backend == AttentionBackendEnum.FLASH_ATTN
or self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA
):
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
- return max_seqlen, seqlens
+ return max_seqlen
def forward(
self, hidden_states: torch.Tensor, grid_thw: list[list[int]]
@@ -694,14 +676,13 @@ class DotsVisionTransformer(nn.Module):
)
cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
- max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
+ max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
for blk in self.blocks:
hidden_states = blk(
hidden_states,
cu_seqlens=cu_seqlens,
rotary_pos_emb=rotary_pos_emb,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
if self.post_trunk_norm is not None:
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index daa5bf03ea4a9..07b34fbc8addb 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -214,7 +214,6 @@ class Ernie4_5_VisionAttention(nn.Module):
if self.attn_backend not in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.TORCH_SDPA,
- AttentionBackendEnum.XFORMERS,
AttentionBackendEnum.ROCM_AITER_FA,
}:
raise RuntimeError(
@@ -259,7 +258,6 @@ class Ernie4_5_VisionAttention(nn.Module):
cu_seqlens: torch.Tensor,
rotary_pos_emb: torch.Tensor,
max_seqlen: int | None = None, # Only used for Flash Attention
- seqlens: list[int] | None = None, # Only used for xFormers
) -> torch.Tensor:
# [s, b, c] --> [s, b, head * 3 * head_dim]
x, _ = self.qkv(x)
@@ -311,20 +309,6 @@ class Ernie4_5_VisionAttention(nn.Module):
context_layer = rearrange(
context_layer, "b s h d -> s b (h d)"
).contiguous()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- from xformers import ops as xops
- from xformers.ops.fmha.attn_bias import BlockDiagonalMask
-
- attn_bias = BlockDiagonalMask.from_seqlens(
- q_seqlen=seqlens, kv_seqlen=None, device=q.device
- )
-
- context_layer = xops.memory_efficient_attention_forward(
- q, k, v, attn_bias=attn_bias, p=0, scale=None
- )
- context_layer = rearrange(
- context_layer, "b s h d -> s b (h d)"
- ).contiguous()
output, _ = self.proj(context_layer)
return output
@@ -404,14 +388,12 @@ class Ernie4_5_VisionBlock(nn.Module):
cu_seqlens: torch.Tensor,
rotary_pos_emb: torch.Tensor,
max_seqlen: int | None = None, # Only used for Flash Attention
- seqlens: list[int] | None = None, # Only used for xFormers
) -> torch.Tensor:
hidden_states = hidden_states + self.attn(
self.norm1(hidden_states),
cu_seqlens=cu_seqlens,
rotary_pos_emb=rotary_pos_emb,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
return hidden_states
@@ -562,18 +544,14 @@ class Ernie4_5_VisionTransformer(nn.Module):
rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
return rotary_pos_emb
- def compute_attn_mask_seqlen(
- self, cu_seqlens: torch.Tensor
- ) -> tuple[int | None, list[int] | None]:
- max_seqlen, seqlens = None, None
+ def compute_attn_mask_seqlen(self, cu_seqlens: torch.Tensor) -> int | None:
+ max_seqlen = None
if (
self.attn_backend == AttentionBackendEnum.FLASH_ATTN
or self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA
):
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
- return max_seqlen, seqlens
+ return max_seqlen
def forward(
self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, num_pad=0
@@ -598,8 +576,8 @@ class Ernie4_5_VisionTransformer(nn.Module):
if hidden_states.ndim == 2:
hidden_states = hidden_states.unsqueeze(dim=1)
- # pre-compute seqlens for attn mask to reduce cuMemcpy operations
- max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
+ # pre-compute max_seqlen for attn mask to reduce cuMemcpy operations
+ max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
for i, blk in enumerate(self.blocks):
hidden_states = blk(
@@ -607,7 +585,6 @@ class Ernie4_5_VisionTransformer(nn.Module):
cu_seqlens=cu_seqlens,
rotary_pos_emb=rotary_pos_emb,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
final_output = self.ln(hidden_states)
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index d141e95498064..7e0370886884f 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -309,7 +309,6 @@ class Glm4vVisionAttention(nn.Module):
if self.attn_backend not in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.TORCH_SDPA,
- AttentionBackendEnum.XFORMERS,
AttentionBackendEnum.ROCM_AITER_FA,
}:
raise RuntimeError(
@@ -345,7 +344,6 @@ class Glm4vVisionAttention(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: int | None = None, # Only used for Flash Attention
- seqlens: list[int] | None = None, # Only used for xFormers
) -> torch.Tensor:
# [s, b, c] --> [s, b, head * 3 * head_dim]
x, _ = self.qkv(x)
@@ -400,20 +398,6 @@ class Glm4vVisionAttention(nn.Module):
context_layer = rearrange(
context_layer, "b s h d -> s b (h d)"
).contiguous()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- from xformers import ops as xops
- from xformers.ops.fmha.attn_bias import BlockDiagonalMask
-
- attn_bias = BlockDiagonalMask.from_seqlens(
- q_seqlen=seqlens, kv_seqlen=None, device=q.device
- )
-
- context_layer = xops.memory_efficient_attention_forward(
- q, k, v, attn_bias=attn_bias, p=0, scale=None
- )
- context_layer = rearrange(
- context_layer, "b s h d -> s b (h d)"
- ).contiguous()
output, _ = self.proj(context_layer)
return output
@@ -461,7 +445,6 @@ class Glm4vVisionBlock(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: int | None = None, # Only used for Flash Attention
- seqlens: list[int] | None = None, # Only used for xFormers
) -> torch.Tensor:
x_attn = self.attn(
self.norm1(x),
@@ -469,7 +452,6 @@ class Glm4vVisionBlock(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
x_fused_norm, residual = self.norm2(x, residual=x_attn)
x = residual + self.mlp(x_fused_norm)
@@ -803,15 +785,14 @@ class Glm4vVisionTransformer(nn.Module):
def compute_attn_mask_seqlen(
self,
cu_seqlens: torch.Tensor,
- ) -> tuple[int | None, list[int] | None]:
- max_seqlen, seqlens = None, None
- seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+ ) -> int | None:
+ max_seqlen = None
if (
self.attn_backend == AttentionBackendEnum.FLASH_ATTN
or self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA
):
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
- return max_seqlen, seqlens
+ return max_seqlen
def forward(
self,
@@ -836,8 +817,9 @@ class Glm4vVisionTransformer(nn.Module):
).cumsum(dim=0, dtype=torch.int32)
cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
- # pre-compute seqlens for attn mask to reduce cuMemcpy operations
- max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
+ # pre-compute max_seqlen for attn mask to reduce cuMemcpy operations
+ max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
+ seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
x = self.embeddings(
x, seqlens, grid_thw, image_type_ids[:, 0], image_type_ids[:, 1]
)
@@ -851,7 +833,6 @@ class Glm4vVisionTransformer(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
# adapter
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index 8fc3db296aa79..302260b952992 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -9,6 +9,7 @@ from typing import Annotated, Any, Literal, TypeAlias, TypeVar
import numpy as np
import torch
import torch.nn as nn
+import torch.nn.functional as F
from einops import rearrange
from transformers import PretrainedConfig
from transformers.activations import GELUActivation
@@ -424,7 +425,7 @@ class KeyeSiglipAttention(nn.Module):
if self.attn_backend not in {
AttentionBackendEnum.FLASH_ATTN,
- AttentionBackendEnum.XFORMERS,
+ AttentionBackendEnum.TORCH_SDPA,
AttentionBackendEnum.ROCM_AITER_FA,
}:
raise RuntimeError(
@@ -451,7 +452,6 @@ class KeyeSiglipAttention(nn.Module):
)
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
- seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
batch_size = q.shape[0]
if rope_emb is None:
@@ -498,17 +498,21 @@ class KeyeSiglipAttention(nn.Module):
softmax_scale=self.scale,
)
context_layer = rearrange(output, "(b s) ... -> b s ...", b=batch_size)
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- from xformers import ops as xops
- from xformers.ops.fmha.attn_bias import BlockDiagonalMask
-
- attn_bias = BlockDiagonalMask.from_seqlens(
- q_seqlen=seqlens, kv_seqlen=None, device=q.device
- )
-
- context_layer = xops.memory_efficient_attention_forward(
- q, k, v, attn_bias=attn_bias, p=0, scale=None
- )
+ elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
+ outputs = []
+ for i in range(1, len(cu_seqlens)):
+ start_idx = cu_seqlens[i - 1]
+ end_idx = cu_seqlens[i]
+ q_i = q[:, start_idx:end_idx]
+ k_i = k[:, start_idx:end_idx]
+ v_i = v[:, start_idx:end_idx]
+ q_i, k_i, v_i = (
+ rearrange(x, "b s h d -> b h s d") for x in (q_i, k_i, v_i)
+ )
+ output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0)
+ output_i = rearrange(output_i, "b h s d -> b s h d ")
+ outputs.append(output_i)
+ context_layer = torch.cat(outputs, dim=1) if outputs else q[:, :0]
context_layer = rearrange(context_layer, "b s h d -> b s (h d)").contiguous()
diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py
index dee0c16ab0f63..74bb868492da9 100644
--- a/vllm/model_executor/models/paddleocr_vl.py
+++ b/vllm/model_executor/models/paddleocr_vl.py
@@ -38,7 +38,6 @@ from vllm.attention.layer import (
)
from vllm.attention.ops.vit_attn_wrappers import (
vit_flash_attn_wrapper,
- vit_xformers_attn_wrapper,
)
from vllm.config import VllmConfig
from vllm.config.multimodal import BaseDummyOptions
@@ -657,7 +656,6 @@ class SiglipAttention(nn.Module):
cu_seqlens: torch.Tensor,
rotary_pos_emb: torch.Tensor | None,
max_seqlen: torch.Tensor | None,
- seqlens: torch.Tensor | None,
) -> torch.Tensor:
batch_size, _, _ = hidden_states.shape
@@ -703,10 +701,6 @@ class SiglipAttention(nn.Module):
context_layer = rearrange(
context_layer, "b s h d -> s b (h d)"
).contiguous()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- if seqlens is None:
- raise ValueError("xFormers attention backend requires seqlens tensor.")
- context_layer = vit_xformers_attn_wrapper(q, k, v, seqlens)
else:
raise RuntimeError(
f"PaddleOCR-VL does not support {self.attn_backend} backend now."
@@ -818,7 +812,6 @@ class SiglipEncoderLayer(nn.Module):
cu_seqlens: torch.Tensor,
rotary_pos_emb: torch.Tensor | None,
max_seqlen: torch.Tensor | None,
- seqlens: torch.Tensor | None,
) -> torch.Tensor:
residual = hidden_states
@@ -828,7 +821,6 @@ class SiglipEncoderLayer(nn.Module):
cu_seqlens=cu_seqlens,
rotary_pos_emb=rotary_pos_emb,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
hidden_states = residual + hidden_states
@@ -870,7 +862,6 @@ class SiglipEncoder(nn.Module):
if self.attn_backend not in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.TORCH_SDPA,
- AttentionBackendEnum.XFORMERS,
AttentionBackendEnum.ROCM_AITER_FA,
}:
raise RuntimeError(
@@ -943,14 +934,11 @@ class SiglipEncoder(nn.Module):
cu_seqlens = cu_seqlens.to(device=device)
max_seqlen = None
- seqlens = None
if self.attn_backend in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.ROCM_AITER_FA,
}:
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- seqlens = cu_seqlens[1:] - cu_seqlens[:-1]
hidden_states = inputs_embeds
for encoder_layer in self.layers:
@@ -959,7 +947,6 @@ class SiglipEncoder(nn.Module):
cu_seqlens=cu_seqlens,
rotary_pos_emb=rotary_pos_emb,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
return hidden_states
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 8a034fd72b02a..6011d93a795d1 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -74,6 +74,7 @@ from .vision import (
)
try:
+ # Note: vLLM does not install xformers by default.
from xformers import ops as xops
if current_platform.is_cuda() and current_platform.has_device_capability(100):
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py
index 1500a437613cc..8c707c2561af1 100644
--- a/vllm/model_executor/models/qwen2_5_vl.py
+++ b/vllm/model_executor/models/qwen2_5_vl.py
@@ -46,7 +46,6 @@ from vllm.attention.layer import maybe_get_vit_flash_attn_backend
from vllm.attention.ops.vit_attn_wrappers import (
vit_flash_attn_wrapper,
vit_torch_sdpa_wrapper,
- vit_xformers_attn_wrapper,
)
from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
@@ -375,7 +374,6 @@ class Qwen2_5_VisionAttention(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: torch.Tensor, # Only used for Flash Attention
- seqlens: torch.Tensor, # Only used for xFormers
) -> torch.Tensor:
# [s, b, c] --> [s, b, head * 3 * head_dim]
x, _ = self.qkv(x)
@@ -435,8 +433,6 @@ class Qwen2_5_VisionAttention(nn.Module):
v,
cu_seqlens,
)
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- context_layer = vit_xformers_attn_wrapper(q, k, v, seqlens)
output, _ = self.proj(context_layer)
return output
@@ -448,9 +444,7 @@ class Qwen2_5_VisionAttention(nn.Module):
"cu_seqlens": 0,
"rotary_pos_emb_cos": 0,
"rotary_pos_emb_sin": 0,
- "seqlens": 0,
},
- mark_unbacked_dims={"seqlens": 0},
enable_if=should_torch_compile_mm_vit,
)
class Qwen2_5_VisionBlock(nn.Module):
@@ -501,7 +495,6 @@ class Qwen2_5_VisionBlock(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: torch.Tensor, # Only used for Flash Attention
- seqlens: torch.Tensor, # Only used for xFormers
) -> torch.Tensor:
x_attn = self.attn(
self.norm1(x),
@@ -509,7 +502,6 @@ class Qwen2_5_VisionBlock(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
x_fused_norm, residual = self.norm2(x, residual=x_attn)
x = residual + self.mlp(x_fused_norm)
@@ -670,7 +662,6 @@ class Qwen2_5_VisionTransformer(nn.Module):
if self.attn_backend not in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.TORCH_SDPA,
- AttentionBackendEnum.XFORMERS,
AttentionBackendEnum.ROCM_AITER_FA,
}:
raise RuntimeError(
@@ -822,17 +813,14 @@ class Qwen2_5_VisionTransformer(nn.Module):
def compute_attn_mask_seqlen(
self,
cu_seqlens: torch.Tensor,
- ) -> tuple[torch.Tensor, torch.Tensor]:
+ ) -> torch.Tensor:
max_seqlen = torch.zeros([], device=cu_seqlens.device)
- seqlens = torch.zeros(1, device=cu_seqlens.device)
if self.attn_backend in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.ROCM_AITER_FA,
}:
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- seqlens = cu_seqlens[1:] - cu_seqlens[:-1]
- return max_seqlen, seqlens
+ return max_seqlen
@staticmethod
def invert_permutation(perm: torch.Tensor) -> torch.Tensor:
@@ -897,10 +885,8 @@ class Qwen2_5_VisionTransformer(nn.Module):
# transformers
# pre-compute seqlens for window/full attn to reduce cuMemcpy operations
- max_seqlen_full, seqlens_full = self.compute_attn_mask_seqlen(cu_seqlens)
- max_seqlen_window, seqlens_window = self.compute_attn_mask_seqlen(
- cu_window_seqlens
- )
+ max_seqlen_full = self.compute_attn_mask_seqlen(cu_seqlens)
+ max_seqlen_window = self.compute_attn_mask_seqlen(cu_window_seqlens)
cu_seqlens = cu_seqlens.to(device=self.device, non_blocking=True)
cu_window_seqlens = cu_window_seqlens.to(device=self.device, non_blocking=True)
@@ -927,11 +913,9 @@ class Qwen2_5_VisionTransformer(nn.Module):
if layer_num in self.fullatt_block_indexes:
cu_seqlens_now = cu_seqlens
max_seqlen_now = max_seqlen_full
- seqlens_now = seqlens_full
else:
cu_seqlens_now = cu_window_seqlens
max_seqlen_now = max_seqlen_window
- seqlens_now = seqlens_window
hidden_states = blk(
hidden_states,
@@ -939,7 +923,6 @@ class Qwen2_5_VisionTransformer(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen_now,
- seqlens=seqlens_now,
)
# For Qwen2.5-VL-3B, float16 will overflow at last block
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 479a7871e364f..9d1d023aed172 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -348,7 +348,6 @@ class Qwen2VisionAttention(nn.Module):
if self.attn_backend not in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.TORCH_SDPA,
- AttentionBackendEnum.XFORMERS,
AttentionBackendEnum.ROCM_AITER_FA,
}:
raise RuntimeError(
@@ -384,7 +383,6 @@ class Qwen2VisionAttention(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: int | None = None, # Only used for Flash Attention
- seqlens: list[int] | None = None, # Only used for xFormers
) -> torch.Tensor:
# [s, b, c] --> [s, b, 3 * head * head_dim]
x, _ = self.qkv(x)
@@ -445,20 +443,6 @@ class Qwen2VisionAttention(nn.Module):
context_layer = rearrange(
context_layer, "b s h d -> s b (h d)"
).contiguous()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- from xformers import ops as xops
- from xformers.ops.fmha.attn_bias import BlockDiagonalMask
-
- attn_bias = BlockDiagonalMask.from_seqlens(
- q_seqlen=seqlens, kv_seqlen=None, device=q.device
- )
-
- context_layer = xops.memory_efficient_attention_forward(
- q, k, v, attn_bias=attn_bias, p=0, scale=None
- )
- context_layer = rearrange(
- context_layer, "b s h d -> s b (h d)"
- ).contiguous()
output, _ = self.proj(context_layer)
return output
@@ -509,7 +493,6 @@ class Qwen2VisionBlock(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: int | None = None, # Only used for Flash Attention
- seqlens: list[int] | None = None, # Only used for xFormers
) -> torch.Tensor:
x = x + self.attn(
self.norm1(x),
@@ -517,7 +500,6 @@ class Qwen2VisionBlock(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
x = x + self.mlp(self.norm2(x))
@@ -728,18 +710,14 @@ class Qwen2VisionTransformer(nn.Module):
sin_combined = sin[pos_ids].flatten(1)
return cos_combined, sin_combined
- def compute_attn_mask_seqlen(
- self, cu_seqlens: torch.Tensor
- ) -> tuple[int | None, list[int] | None]:
- max_seqlen, seqlens = None, None
+ def compute_attn_mask_seqlen(self, cu_seqlens: torch.Tensor) -> int | None:
+ max_seqlen = None
if self.attn_backend in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.ROCM_AITER_FA,
}:
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
- return max_seqlen, seqlens
+ return max_seqlen
def forward(
self,
@@ -771,7 +749,7 @@ class Qwen2VisionTransformer(nn.Module):
x = x.unsqueeze(1)
# pre-compute seqlens for attn mask to reduce cuMemcpy operations
- max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
+ max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
cu_seqlens = cu_seqlens.to(self.device, non_blocking=True)
for blk in self.blocks:
x = blk(
@@ -780,7 +758,6 @@ class Qwen2VisionTransformer(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
# adapter
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index 54ef56f83344e..61f218f16d79c 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -224,7 +224,6 @@ class Qwen3_VisionBlock(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: torch.Tensor, # Only used for Flash Attention
- seqlens: torch.Tensor, # Only used for xFormers
) -> torch.Tensor:
x = x + self.attn(
self.norm1(x),
@@ -232,7 +231,6 @@ class Qwen3_VisionBlock(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
x = x + self.mlp(self.norm2(x))
@@ -500,14 +498,11 @@ class Qwen3Omni_VisionTransformer(nn.Module):
def compute_attn_mask_seqlen(
self,
cu_seqlens: torch.Tensor,
- ) -> tuple[torch.Tensor, torch.Tensor]:
+ ) -> torch.Tensor:
max_seqlen = torch.zeros([], device=cu_seqlens.device)
- seqlens = torch.zeros(1, device=cu_seqlens.device)
if self.attn_backend == AttentionBackendEnum.FLASH_ATTN:
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- seqlens = cu_seqlens[1:] - cu_seqlens[:-1]
- return max_seqlen, seqlens
+ return max_seqlen
def forward(
self,
@@ -533,7 +528,7 @@ class Qwen3Omni_VisionTransformer(nn.Module):
hidden_states = hidden_states.unsqueeze(1)
rotary_pos_emb_cos = rotary_pos_emb_cos.to(hidden_states.device)
rotary_pos_emb_sin = rotary_pos_emb_sin.to(hidden_states.device)
- max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
+ max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
hidden_states_list = []
deepstack_visual_indexes = self.deepstack_visual_indexes
@@ -545,7 +540,6 @@ class Qwen3Omni_VisionTransformer(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
if (
deepstack_visual_indexes is not None
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 90c4894d33e88..4cd6fa14c32df 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -235,7 +235,6 @@ class Qwen3_VisionBlock(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: torch.Tensor, # Only used for Flash Attention
- seqlens: torch.Tensor, # Only used for xFormers
) -> torch.Tensor:
x = x + self.attn(
self.norm1(x),
@@ -243,7 +242,6 @@ class Qwen3_VisionBlock(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
x = x + self.mlp(self.norm2(x))
@@ -391,7 +389,6 @@ class Qwen3_VisionTransformer(nn.Module):
if self.attn_backend not in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.TORCH_SDPA,
- AttentionBackendEnum.XFORMERS,
AttentionBackendEnum.ROCM_AITER_FA,
}:
raise RuntimeError(
@@ -531,17 +528,14 @@ class Qwen3_VisionTransformer(nn.Module):
def compute_attn_mask_seqlen(
self,
cu_seqlens: torch.Tensor,
- ) -> tuple[torch.Tensor, torch.Tensor]:
+ ) -> torch.Tensor:
max_seqlen = torch.zeros([], device=cu_seqlens.device)
- seqlens = torch.zeros(1, device=cu_seqlens.device)
if (
self.attn_backend == AttentionBackendEnum.FLASH_ATTN
or self.attn_backend == AttentionBackendEnum.ROCM_AITER_FA
):
max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
- elif self.attn_backend == AttentionBackendEnum.XFORMERS:
- seqlens = cu_seqlens[1:] - cu_seqlens[:-1]
- return max_seqlen, seqlens
+ return max_seqlen
def forward(
self,
@@ -569,7 +563,7 @@ class Qwen3_VisionTransformer(nn.Module):
cu_seqlens = torch.from_numpy(cu_seqlens)
hidden_states = hidden_states.unsqueeze(1)
- max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
+ max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
cu_seqlens = cu_seqlens.to(self.device, non_blocking=True)
deepstack_feature_lists = []
@@ -580,7 +574,6 @@ class Qwen3_VisionTransformer(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
- seqlens=seqlens,
)
if layer_num in self.deepstack_visual_indexes:
deepstack_merger_idx = self.deepstack_visual_indexes.index(layer_num)
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index f9bf242b7194e..06793a3d1bb14 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -277,12 +277,7 @@ class CudaPlatformBase(Platform):
except ImportError:
pass
- if cls.has_device_capability(100):
- # xFormers doesn't support Blackwell, fall back to SDPA
- # See https://github.com/facebookresearch/xformers/issues/1317#issuecomment-3199392579 # noqa: E501
- return AttentionBackendEnum.TORCH_SDPA
- else:
- return AttentionBackendEnum.XFORMERS
+ return AttentionBackendEnum.TORCH_SDPA
@classmethod
def get_valid_backends(
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index 3ef44e7703204..d94da71b289f3 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -49,7 +49,6 @@ STR_BACKEND_ENV_VAR: str = "VLLM_ATTENTION_BACKEND"
# Possible string values of STR_BACKEND_ENV_VAR
# register, corresponding to possible backends
STR_FLASHINFER_ATTN_VAL: str = "FLASHINFER"
-STR_XFORMERS_ATTN_VAL: str = "XFORMERS"
STR_FLASH_ATTN_VAL: str = "FLASH_ATTN"
STR_INVALID_VAL: str = "INVALID"
diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py
deleted file mode 100644
index 5039c44b9c3e6..0000000000000
--- a/vllm/v1/attention/backends/xformers.py
+++ /dev/null
@@ -1,420 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Attention layer with XFormersAttention."""
-
-from dataclasses import dataclass
-from typing import ClassVar, Optional
-
-import torch
-
-from vllm.attention.backends.abstract import (
- AttentionBackend,
- AttentionImpl,
- AttentionType,
- MultipleOf,
-)
-from vllm.attention.ops.triton_unified_attention import unified_attention
-from vllm.config import VllmConfig
-from vllm.logger import init_logger
-from vllm.v1.attention.backends.utils import (
- AttentionMetadataBuilder,
- CommonAttentionMetadata,
- split_decodes_and_prefills,
-)
-from vllm.v1.kv_cache_interface import AttentionSpec
-
-try:
- from xformers import ops as xops
- from xformers.ops.fmha.attn_bias import (
- AttentionBias,
- PagedBlockDiagonalCausalWithOffsetPaddedKeysMask,
- )
-
- XFORMERS_AVAILABLE = True
-except ImportError:
- XFORMERS_AVAILABLE = False
-
-from vllm import _custom_ops as ops
-
-logger = init_logger(__name__)
-
-
-class XFormersAttentionBackend(AttentionBackend):
- accept_output_buffer: bool = True
- supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16]
-
- @staticmethod
- def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
- return [MultipleOf(16)]
-
- @classmethod
- def get_supported_head_sizes(cls) -> list[int]:
- return [
- 32,
- 40,
- 48,
- 56,
- 64,
- 72,
- 80,
- 88,
- 96,
- 104,
- 112,
- 120,
- 128,
- 136,
- 144,
- 152,
- 160,
- 168,
- 176,
- 184,
- 192,
- 200,
- 208,
- 216,
- 224,
- 232,
- 240,
- 248,
- 256,
- ]
-
- @staticmethod
- def get_name() -> str:
- return "XFORMERS"
-
- @staticmethod
- def get_impl_cls() -> type["XFormersAttentionImpl"]:
- return XFormersAttentionImpl
-
- @staticmethod
- def get_kv_cache_shape(
- num_blocks: int,
- block_size: int,
- num_kv_heads: int,
- head_size: int,
- cache_dtype_str: str = "auto",
- ) -> tuple[int, ...]:
- if block_size % 16 != 0:
- raise ValueError("Block size must be a multiple of 16.")
- return (2, num_blocks, block_size, num_kv_heads, head_size)
-
- @staticmethod
- def get_builder_cls() -> type["XFormersAttentionMetadataBuilder"]:
- return XFormersAttentionMetadataBuilder
-
- @staticmethod
- def use_cascade_attention(*args, **kwargs) -> bool:
- return False
-
-
-@dataclass
-class XFormersAttentionMetadata:
- num_actual_tokens: int # Number of tokens excluding padding.
- max_query_len: int
- query_start_loc: torch.Tensor
- max_seq_len: int
- seq_lens: torch.Tensor
- block_table: torch.Tensor
- slot_mapping: torch.Tensor
-
- num_prefill_tokens: int = 0
- num_decode_tokens: int = 0
- num_prefills: int = 0
- num_decodes: int = 0
-
- # Biases for different attention types.
- attn_bias: Optional["AttentionBias"] = None
-
- # Self-attention prefill/decode metadata cache
- _cached_prefill_metadata: Optional["XFormersAttentionMetadata"] = None
- _cached_decode_metadata: Optional["XFormersAttentionMetadata"] = None
-
- @property
- def prefill_metadata(self) -> Optional["XFormersAttentionMetadata"]:
- if self.num_prefills == 0:
- return None
-
- if self._cached_prefill_metadata is not None:
- # Recover cached prefill-phase attention
- # metadata structure
- return self._cached_prefill_metadata
-
- q_start_loc = self.query_start_loc[self.num_decodes :]
- q_seqlens = torch.diff(q_start_loc)
- kv_seqlens = self.seq_lens[self.num_decodes :]
- # Construct & cache prefill-phase attention metadata structure
- self._cached_prefill_metadata = XFormersAttentionMetadata(
- num_actual_tokens=self.num_prefill_tokens,
- max_query_len=int(q_seqlens.max().item()),
- query_start_loc=q_start_loc - q_start_loc[0],
- max_seq_len=int(kv_seqlens.max().item()),
- seq_lens=kv_seqlens,
- block_table=self.block_table[self.num_decodes :],
- slot_mapping=self.slot_mapping[self.num_decode_tokens :],
- )
- return self._cached_prefill_metadata
-
- @property
- def decode_metadata(self) -> Optional["XFormersAttentionMetadata"]:
- if self.num_decode_tokens == 0:
- return None
-
- if self._cached_decode_metadata is not None:
- # Recover cached decode-phase attention
- # metadata structure
- return self._cached_decode_metadata
-
- q_start_loc = self.query_start_loc
- q_seqlens = torch.diff(q_start_loc)
- decode_kv_seqlens = self.seq_lens[: self.num_decodes]
- # Construct & cache decode-phase attention metadata structure
- self._cached_decode_metadata = XFormersAttentionMetadata(
- num_actual_tokens=self.num_decode_tokens,
- max_query_len=int(q_seqlens[: self.num_decodes].max().item()),
- query_start_loc=q_start_loc[: self.num_decodes + 1],
- max_seq_len=int(decode_kv_seqlens.max().item()),
- seq_lens=decode_kv_seqlens,
- block_table=self.block_table[: self.num_decodes],
- slot_mapping=self.slot_mapping[: self.num_decode_tokens],
- attn_bias=self.attn_bias,
- )
- return self._cached_decode_metadata
-
-
-class XFormersAttentionMetadataBuilder(
- AttentionMetadataBuilder[XFormersAttentionMetadata]
-):
- reorder_batch_threshold: int = 1
-
- def __init__(
- self,
- kv_cache_spec: AttentionSpec,
- layer_names: list[str],
- vllm_config: VllmConfig,
- device: torch.device,
- ):
- super().__init__(kv_cache_spec, layer_names, vllm_config, device)
-
- assert XFORMERS_AVAILABLE
- self.block_size = kv_cache_spec.block_size
- self._num_decodes = 0
- self._num_decode_tokens = 0
-
- def build(
- self,
- common_prefix_len: int,
- common_attn_metadata: CommonAttentionMetadata,
- fast_build: bool = False,
- ) -> XFormersAttentionMetadata:
- num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
- split_decodes_and_prefills(
- common_attn_metadata, decode_threshold=self.reorder_batch_threshold
- )
- )
-
- num_actual_tokens = common_attn_metadata.num_actual_tokens
- q_start_loc = common_attn_metadata.query_start_loc
- q_seqlens = torch.diff(q_start_loc)
- max_query_len = common_attn_metadata.max_query_len
- kv_seqlens = common_attn_metadata.seq_lens
- max_seq_len = common_attn_metadata.max_seq_len
- block_table = common_attn_metadata.block_table_tensor
- slot_mapping = common_attn_metadata.slot_mapping
-
- bias = None
- if num_decodes > 0:
- # Construct the decoder bias.
- decode_q_seqlens = q_seqlens[:num_decodes]
- decode_kv_seqlens = kv_seqlens[:num_decodes]
- bias = PagedBlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
- q_seqlen=decode_q_seqlens.tolist(),
- kv_seqlen=decode_kv_seqlens.tolist(),
- page_size=self.block_size,
- block_tables=block_table[:num_decodes],
- device=block_table.device,
- )
-
- return XFormersAttentionMetadata(
- num_actual_tokens=num_actual_tokens,
- num_prefill_tokens=num_prefill_tokens,
- num_decode_tokens=num_decode_tokens,
- num_prefills=num_prefills,
- num_decodes=num_decodes,
- max_query_len=max_query_len,
- query_start_loc=q_start_loc,
- max_seq_len=max_seq_len,
- seq_lens=kv_seqlens,
- block_table=block_table,
- slot_mapping=slot_mapping,
- attn_bias=bias,
- )
-
-
-class XFormersAttentionImpl(AttentionImpl):
- def __init__(
- self,
- num_heads: int,
- head_size: int,
- scale: float,
- num_kv_heads: int,
- alibi_slopes: list[float] | None,
- sliding_window: int | None,
- kv_cache_dtype: str,
- logits_soft_cap: float | None = None,
- attn_type: AttentionType = AttentionType.DECODER,
- kv_sharing_target_layer_name: str | None = None,
- ) -> None:
- if kv_sharing_target_layer_name is not None:
- raise NotImplementedError("KV sharing is not supported in V0.")
- if alibi_slopes is not None:
- raise NotImplementedError("XFormers does not support alibi slopes yet.")
- self.num_heads = num_heads
- self.head_size = head_size
- self.scale = float(scale)
- self.num_kv_heads = num_kv_heads
- self.num_queries_per_kv = self.num_heads // self.num_kv_heads
- self.kv_cache_dtype = kv_cache_dtype
- self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
- if alibi_slopes is not None:
- alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
- self.alibi_slopes = alibi_slopes
- if sliding_window is None:
- self.sliding_window = (-1, -1)
- else:
- self.sliding_window = (sliding_window - 1, 0)
- if logits_soft_cap is None:
- # Setting logits_soft_cap to 0 means no soft cap.
- logits_soft_cap = 0
- self.logits_soft_cap = logits_soft_cap
-
- if attn_type != AttentionType.DECODER:
- raise NotImplementedError(
- "Encoder self-attention and "
- "encoder/decoder cross-attention "
- "are not implemented for "
- "XFormersAttentionImpl."
- )
-
- def forward(
- self,
- layer: torch.nn.Module,
- query: torch.Tensor,
- key: torch.Tensor,
- value: torch.Tensor,
- kv_cache: torch.Tensor,
- attn_metadata: XFormersAttentionMetadata,
- output: torch.Tensor | None = None,
- output_scale: torch.Tensor | None = None,
- output_block_scale: torch.Tensor | None = None,
- ) -> torch.Tensor:
- """Forward pass with XFormers.
-
- Args:
- query: shape = [num_tokens, num_heads, head_size]
- key: shape = [num_tokens, num_kv_heads, head_size]
- value: shape = [num_tokens, num_kv_heads, head_size]
- kv_cache: shape =
- [2, num_blocks, block_size, num_kv_heads, head_size]
- attn_metadata: Metadata for attention.
- Returns:
- shape = [num_tokens, num_heads * head_size]
- """
- assert output is not None, "Output tensor must be provided."
-
- if output_scale is not None or output_block_scale is not None:
- raise NotImplementedError(
- "fused output quantization is not yet supported"
- " for XFormersAttentionImpl"
- )
-
- if attn_metadata is None:
- # Profiling run.
- return output.fill_(0)
-
- # Cache the input KVs.
- key_cache, value_cache = kv_cache.unbind(0)
- if self.kv_sharing_target_layer_name is None:
- # Reshape the input keys and values and store them in the cache.
- # Skip this if sharing KV cache with an earlier attention layer.
- # NOTE(woosuk): Here, key and value are padded while slot_mapping is
- # not padded. However, we don't need to do key[:num_actual_tokens]
- # and value[:num_actual_tokens] because the reshape_and_cache_flash
- # op uses the slot_mapping's shape to determine the number of
- # actual tokens.
- ops.reshape_and_cache_flash(
- key,
- value,
- key_cache,
- value_cache,
- attn_metadata.slot_mapping,
- self.kv_cache_dtype,
- layer._k_scale,
- layer._v_scale,
- )
-
- num_actual_tokens = attn_metadata.num_actual_tokens
- num_decode_tokens = attn_metadata.num_decode_tokens
- if prefill_meta := attn_metadata.prefill_metadata:
- descale_shape = (prefill_meta.query_start_loc.shape[0] - 1, key.shape[1])
- unified_attention(
- q=query[num_decode_tokens:num_actual_tokens],
- k=key_cache,
- v=value_cache,
- out=output[num_decode_tokens:num_actual_tokens],
- cu_seqlens_q=prefill_meta.query_start_loc,
- max_seqlen_q=prefill_meta.max_query_len,
- seqused_k=prefill_meta.seq_lens,
- max_seqlen_k=prefill_meta.max_seq_len,
- softmax_scale=self.scale,
- causal=True,
- alibi_slopes=self.alibi_slopes,
- window_size=self.sliding_window,
- block_table=prefill_meta.block_table,
- softcap=self.logits_soft_cap,
- q_descale=None, # Not supported
- k_descale=layer._k_scale.expand(descale_shape),
- v_descale=layer._v_scale.expand(descale_shape),
- )
-
- if decode_meta := attn_metadata.decode_metadata:
- # Query for decode. KV is not needed because it is already cached.
- decode_query = query[:num_decode_tokens]
- # Reshape query to [1, B_T, G, H, D].
- q = decode_query.view(
- 1, -1, self.num_kv_heads, self.num_queries_per_kv, self.head_size
- )
- # Reshape the k and v caches to [1, Bkv_T, G, H, D]
- cache_k = key_cache.view(
- 1, -1, self.num_kv_heads, 1, self.head_size
- ).expand(
- 1,
- -1,
- self.num_kv_heads,
- self.num_queries_per_kv,
- self.head_size,
- )
- cache_v = value_cache.view(
- 1, -1, self.num_kv_heads, 1, self.head_size
- ).expand(
- 1,
- -1,
- self.num_kv_heads,
- self.num_queries_per_kv,
- self.head_size,
- )
-
- attn_bias = decode_meta.attn_bias
- output[:num_decode_tokens] = xops.memory_efficient_attention_forward(
- q,
- cache_k,
- cache_v,
- attn_bias=attn_bias,
- p=0.0,
- scale=self.scale,
- ).view(decode_query.shape)
-
- # Reshape the output tensor.
- return output
From ed40d85929f25a939b8d79ef3f2fb923f5a7bb54 Mon Sep 17 00:00:00 2001
From: Fanli Lin
Date: Mon, 24 Nov 2025 14:48:45 +0800
Subject: [PATCH 002/134] [BugFix] Fix R-VL model loading error (#29299)
Signed-off-by: Lin, Fanli
---
examples/offline_inference/vision_language_multi_image.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index d6e169548f15b..301265d4e17f7 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -1110,6 +1110,7 @@ def load_r_vl(question: str, image_urls: list[str]) -> ModelRequestData:
model=model_name,
max_model_len=16384,
max_num_seqs=16,
+ trust_remote_code=True,
limit_mm_per_prompt={"image": len(image_urls)},
)
From 68dfe28eaefc20ce8d847c3b3ccf712716d20c20 Mon Sep 17 00:00:00 2001
From: "rongfu.leng"
Date: Mon, 24 Nov 2025 18:02:28 +0800
Subject: [PATCH 003/134] [Feature][Benchmark] add --link-vars to filter when
 serve_param equals bench_param (#28909)
Signed-off-by: rongfu.leng
---
vllm/benchmarks/sweep/serve.py | 34 +++++++++++++++++++++++++++++++++-
1 file changed, 33 insertions(+), 1 deletion(-)
diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py
index 45ac446a7aedf..1298e4acbd87d 100644
--- a/vllm/benchmarks/sweep/serve.py
+++ b/vllm/benchmarks/sweep/serve.py
@@ -211,6 +211,7 @@ def run_combs(
output_dir: Path,
num_runs: int,
dry_run: bool,
+ links: list[tuple[str, str]],
):
all_data = list[dict[str, object]]()
for serve_comb in serve_params:
@@ -226,6 +227,14 @@ def run_combs(
else contextlib.nullcontext()
) as server:
for bench_comb in bench_params:
+ should_run = all(
+ serve_key in serve_comb
+ and bench_key in bench_comb
+ and serve_comb[serve_key] == bench_comb[bench_key]
+ for serve_key, bench_key in links
+ )
+ if not should_run:
+ continue
base_path = _get_comb_base_path(output_dir, serve_comb, bench_comb)
comb_data = run_comb(
@@ -262,6 +271,7 @@ class SweepServeArgs:
num_runs: int
dry_run: bool
resume: str | None
+ link_vars: list[tuple[str, str]] | None
parser_name: ClassVar[str] = "serve"
parser_help: ClassVar[str] = "Run vLLM server benchmark under multiple settings."
@@ -285,7 +295,7 @@ class SweepServeArgs:
else:
# i.e.: run bench_cmd without any modification
bench_params = ParameterSweep.from_records([{}])
-
+ link_vars = cls.parse_link_vars(args.link_vars)
num_runs = args.num_runs
if num_runs < 1:
raise ValueError("`num_runs` should be at least 1.")
@@ -301,6 +311,7 @@ class SweepServeArgs:
num_runs=num_runs,
dry_run=args.dry_run,
resume=args.resume,
+ link_vars=link_vars,
)
@classmethod
@@ -376,8 +387,28 @@ class SweepServeArgs:
"parameter combinations for which there are still no output files.",
)
+ parser.add_argument(
+ "--link-vars",
+ type=str,
+ default="",
+ help=(
+ "Comma-separated list of linked variables between serve and bench, "
+ "e.g. max_num_seqs=max_concurrency,max_model_len=random_input_len"
+ ),
+ )
+
return parser
+ @staticmethod
+ def parse_link_vars(s: str) -> list[tuple[str, str]]:
+ if not s:
+ return []
+ pairs = []
+ for item in s.split(","):
+ a, b = item.split("=")
+ pairs.append((a.strip(), b.strip()))
+ return pairs
+
def run_main(args: SweepServeArgs):
timestamp = args.resume or datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -397,6 +428,7 @@ def run_main(args: SweepServeArgs):
output_dir=output_dir,
num_runs=args.num_runs,
dry_run=args.dry_run,
+ links=args.link_vars,
)
except BaseException as exc:
raise RuntimeError(
From 8005e606bf280b7b6002f57e95ae3210ddc6f041 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9D=B0=E5=85=AE?=
<38908462+zhyajie@users.noreply.github.com>
Date: Mon, 24 Nov 2025 18:16:52 +0800
Subject: [PATCH 004/134] [Bugfix][Rocm] Fix shared expert weight loading
failure in DeepSeek-MTP (#27563)
Signed-off-by: zhyajie
Co-authored-by: zhyajie
---
vllm/model_executor/models/deepseek_mtp.py | 153 +++++++++++++++------
vllm/model_executor/models/deepseek_v2.py | 14 +-
2 files changed, 121 insertions(+), 46 deletions(-)
diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py
index e028dc497aa6a..6e23037b919ab 100644
--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -1,15 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Iterable
+import typing
+from collections.abc import Callable, Iterable
import torch
import torch.nn as nn
from transformers import PretrainedConfig
+from vllm._aiter_ops import rocm_aiter_ops
from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe import FusedMoE
+from vllm.model_executor.layers.fused_moe import SharedFusedMoE
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -231,6 +233,9 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
return self.model.compute_logits(hidden_states, spec_step_idx)
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+ rocm_aiter_moe_shared_expert_enabled = (
+ rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
+ )
stacked_params_mapping = [
("gate_up_proj", "gate_proj", 0),
("gate_up_proj", "up_proj", 1),
@@ -238,11 +243,16 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1),
]
- expert_params_mapping = FusedMoE.make_expert_params_mapping(
+ expert_params_mapping = SharedFusedMoE.make_expert_params_mapping(
ckpt_gate_proj_name="gate_proj",
ckpt_down_proj_name="down_proj",
ckpt_up_proj_name="up_proj",
- num_experts=self.config.n_routed_experts,
+ num_experts=self.config.n_routed_experts
+ + (
+ self.config.n_shared_experts
+ if rocm_aiter_moe_shared_expert_enabled
+ else 0
+ ),
)
params_dict = dict(self.named_parameters())
@@ -253,6 +263,9 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
if spec_layer is None:
continue
+ is_fusion_moe_shared_experts_layer = (
+ rocm_aiter_moe_shared_expert_enabled and ("mlp.shared_experts" in name)
+ )
name = self._rewrite_spec_layer_name(spec_layer, name)
for param_name, weight_name, shard_id in stacked_params_mapping:
# Skip non-stacked layers and experts (experts handled below).
@@ -266,6 +279,8 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
# for mlp.experts[0].gate_gate_up_proj, which breaks load.
if ("mlp.experts." in name) and name not in params_dict:
continue
+ if is_fusion_moe_shared_experts_layer:
+ continue
name_mapped = name.replace(weight_name, param_name)
# QKV fusion is optional, fall back to normal
@@ -286,45 +301,105 @@ class DeepSeekMTP(nn.Module, SupportsPP, DeepseekV2MixtureOfExperts):
weight_loader(param, loaded_weight, shard_id)
break
else:
- for mapping in expert_params_mapping:
- param_name, weight_name, expert_id, shard_id = mapping
- if weight_name not in name:
- continue
- name = name.replace(weight_name, param_name)
-
- param = params_dict[name]
- weight_loader = param.weight_loader
- weight_loader(
- param,
- loaded_weight,
- name,
- shard_id=shard_id,
- expert_id=expert_id,
+ # Special handling: when AITER fusion_shared_experts is enabled,
+ # checkpoints may provide a single widened shared_experts tensor
+ # without explicit expert indices
+ # (e.g. ...mlp.shared_experts.gate_proj.weight).
+ # For models with multiple shared experts, split that tensor
+ # evenly into per-shared-expert slices and load them into
+ # appended expert slots mlp.experts.{n_routed_experts + j}.*
+ # accordingly.
+ num_chunks = 1
+ if is_fusion_moe_shared_experts_layer:
+ num_chunks = getattr(self.config, "n_shared_experts", 1) or 1
+ # Determine split axis based on op type
+ # gate/up: ColumnParallel → split along dim 0
+ # down: RowParallel → split along dim 1
+ split_dim = 1 if "down_proj.weight" in name else 0
+ total = loaded_weight.shape[split_dim]
+ assert total % num_chunks == 0, (
+ f"Shared expert weight dim {total} "
+ f"not divisible by num_chunks {num_chunks}"
)
- break
- else:
- # Skip loading extra bias for GPTQ models.
- if name.endswith(".bias") and name not in params_dict:
- continue
+ chunk_size = total // num_chunks
- name = maybe_remap_kv_scale_name(name, params_dict)
- if name is None:
- continue
+ for j in range(num_chunks):
+ chunk_name = name
+ weight_to_load = loaded_weight
- # According to DeepSeek-V3 Technical Report, MTP modules
- # shares embedding layer. We only load the first weights.
- if (
- spec_layer != self.model.mtp_start_layer_idx
- and ".layers" not in name
- ):
- continue
+ if is_fusion_moe_shared_experts_layer:
+ if split_dim == 0:
+ weight_to_load = loaded_weight[
+ j * chunk_size : (j + 1) * chunk_size, :
+ ]
+ else:
+ weight_to_load = loaded_weight[
+ :, j * chunk_size : (j + 1) * chunk_size
+ ]
+ # Synthesize an expert-style name so expert mapping
+ # can route it
+ chunk_name = name.replace(
+ "mlp.shared_experts",
+ f"mlp.experts.{self.config.n_routed_experts + j}",
+ )
- param = params_dict[name]
- weight_loader = getattr(
- param, "weight_loader", default_weight_loader
- )
- weight_loader(param, loaded_weight)
- loaded_params.add(name)
+ # Use expert_params_mapping to locate the destination
+ # param and delegate to its expert-aware weight_loader
+ # with expert_id.
+ for mapping in expert_params_mapping:
+ param_name, weight_name, expert_id, shard_id = mapping
+ if weight_name not in chunk_name:
+ continue
+
+ # Do not modify `name` since the loop may continue here
+ # Instead, create a new variable
+ name_mapped = chunk_name.replace(weight_name, param_name)
+
+ param = params_dict[name_mapped]
+ # We should ask the weight loader to return success or
+ # not here since otherwise we may skip experts with
+ # other available replicas.
+ weight_loader = typing.cast(
+ Callable[..., bool], param.weight_loader
+ )
+ success = weight_loader(
+ param,
+ weight_to_load,
+ name_mapped,
+ shard_id=shard_id,
+ expert_id=expert_id,
+ return_success=True,
+ )
+ if success:
+ if not is_fusion_moe_shared_experts_layer:
+ name = name_mapped
+ else:
+ loaded_params.add(name_mapped)
+ break
+ else:
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+
+ name = maybe_remap_kv_scale_name(name, params_dict)
+ if name is None:
+ continue
+
+ # According to DeepSeek-V3 Technical Report, MTP modules
+ # shares embedding layer. We only load the first weights.
+ if (
+ spec_layer != self.model.mtp_start_layer_idx
+ and ".layers" not in name
+ ):
+ continue
+
+ param = params_dict[name]
+ weight_loader = getattr(
+ param, "weight_loader", default_weight_loader
+ )
+ weight_loader(param, loaded_weight)
+ if not is_fusion_moe_shared_experts_layer:
+ loaded_params.add(name)
return loaded_params
def _rewrite_spec_layer_name(self, spec_layer: int, name: str) -> str:
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 7cfd381592b49..ad932559b983d 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -1479,8 +1479,8 @@ class DeepseekV2ForCausalLM(
if spec_layer is not None:
continue # skip spec decode layers for main model
- is_fuse_shared_experts_layer = rocm_aiter_moe_shared_expert_enabled and (
- "mlp.shared_experts" in name
+ is_fusion_moe_shared_experts_layer = (
+ rocm_aiter_moe_shared_expert_enabled and ("mlp.shared_experts" in name)
)
for param_name, weight_name, shard_id in stacked_params_mapping:
@@ -1495,7 +1495,7 @@ class DeepseekV2ForCausalLM(
# for mlp.experts[0].gate_gate_up_proj, which breaks load.
if ("mlp.experts." in name) and name not in params_dict:
continue
- if is_fuse_shared_experts_layer:
+ if is_fusion_moe_shared_experts_layer:
continue
name_mapped = name.replace(weight_name, param_name)
@@ -1531,7 +1531,7 @@ class DeepseekV2ForCausalLM(
# appended expert slots mlp.experts.{n_routed_experts + j}.*
# accordingly.
num_chunks = 1
- if is_fuse_shared_experts_layer:
+ if is_fusion_moe_shared_experts_layer:
num_chunks = getattr(self.config, "n_shared_experts", 1) or 1
# Determine split axis based on op type
# gate/up: ColumnParallel → split along dim 0
@@ -1548,7 +1548,7 @@ class DeepseekV2ForCausalLM(
chunk_name = name
weight_to_load = loaded_weight
- if is_fuse_shared_experts_layer:
+ if is_fusion_moe_shared_experts_layer:
if split_dim == 0:
weight_to_load = loaded_weight[
j * chunk_size : (j + 1) * chunk_size, :
@@ -1599,7 +1599,7 @@ class DeepseekV2ForCausalLM(
return_success=True,
)
if success:
- if not is_fuse_shared_experts_layer:
+ if not is_fusion_moe_shared_experts_layer:
name = name_mapped
else:
loaded_params.add(name_mapped)
@@ -1628,7 +1628,7 @@ class DeepseekV2ForCausalLM(
param, "weight_loader", default_weight_loader
)
weight_loader(param, loaded_weight)
- if not is_fuse_shared_experts_layer:
+ if not is_fusion_moe_shared_experts_layer:
loaded_params.add(name)
return loaded_params
From eca7a8fb59c223c603922be0bd62f5c460972a50 Mon Sep 17 00:00:00 2001
From: Didier Durand <2927957+didier-durand@users.noreply.github.com>
Date: Mon, 24 Nov 2025 12:10:48 +0100
Subject: [PATCH 005/134] [Doc]: fix typos in various files (#29230)
Signed-off-by: Didier Durand
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
benchmarks/kernels/deepgemm/README.md | 2 +-
vllm/config/vllm.py | 2 +-
vllm/forward_context.py | 2 +-
vllm/v1/worker/gpu_input_batch.py | 2 +-
4 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/benchmarks/kernels/deepgemm/README.md b/benchmarks/kernels/deepgemm/README.md
index 41e68e047be82..a28c6956be0e9 100644
--- a/benchmarks/kernels/deepgemm/README.md
+++ b/benchmarks/kernels/deepgemm/README.md
@@ -2,7 +2,7 @@
This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels.
-Currently this just includes dense GEMMs and only works on Hopper GPUs.
+Currently, this just includes dense GEMMs and only works on Hopper GPUs.
## Setup
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index d64e315b4fe39..8a3599416bc72 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -96,7 +96,7 @@ class VllmConfig:
"""`torch.compile` and cudagraph capture configuration for the model.
As a shorthand, one can append compilation arguments via
- -0.parameter=arguement such as `-O.mode=3` (same as `-O='{"mode":3}'`).
+ -0.parameter=argument such as `-O.mode=3` (same as `-O='{"mode":3}'`).
You can specify the full compilation config like so:
`{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index 25fb7181a8f29..7cb490e391abb 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -153,7 +153,7 @@ class DPMetadata:
@contextmanager
def sp_local_sizes(self, sequence_parallel_size: int):
"""
- Context mamager for setting self.local_sizes. Same as self.chunked_sizes
+ Context manager for setting self.local_sizes. Same as self.chunked_sizes
but without any chunking.
"""
self.local_sizes = _compute_sp_num_tokens(
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index d6fef450c028a..4a2818ab1bfd8 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -525,7 +525,7 @@ class InputBatch:
# NOTE: the following is unsafe
# self.token_ids_cpu[i1, ...], self.token_ids_cpu[i2, ...], =\
# self.token_ids_cpu[i2, ...], self.token_ids_cpu[i1, ...]
- # instead, we need to temporiarily copy the data for one of the indices
+ # instead, we need to temporarily copy the data for one of the indices
# TODO(lucas): optimize this by only copying valid indices
tmp = self.token_ids_cpu[i1, ...].copy()
self.token_ids_cpu[i1, ...] = self.token_ids_cpu[i2, ...]
From 4de87866a8fdc9395ccd40e00c0f9075439b07ae Mon Sep 17 00:00:00 2001
From: R3hankhan
Date: Mon, 24 Nov 2025 17:38:09 +0530
Subject: [PATCH 006/134] [CPU][IBM Z] Fix BF16 support and vectorize math
operations for s390x (#28926)
Signed-off-by: Rehan Khan
---
csrc/cpu/cpu_attn_impl.hpp | 2 +-
csrc/cpu/cpu_types_vxe.hpp | 586 +++++++++++++++++++++++++++++++++----
2 files changed, 531 insertions(+), 57 deletions(-)
diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp
index 12c6f5d3015cc..98f55d7c014be 100644
--- a/csrc/cpu/cpu_attn_impl.hpp
+++ b/csrc/cpu/cpu_attn_impl.hpp
@@ -847,7 +847,7 @@ struct VecTypeTrait {
};
#endif
-#if !defined(__powerpc__)
+#if !defined(__powerpc__) && !defined(__s390x__)
template <>
struct VecTypeTrait {
using vec_t = vec_op::FP16Vec16;
diff --git a/csrc/cpu/cpu_types_vxe.hpp b/csrc/cpu/cpu_types_vxe.hpp
index 51bca37e699b9..9efd8b7ec14a4 100644
--- a/csrc/cpu/cpu_types_vxe.hpp
+++ b/csrc/cpu/cpu_types_vxe.hpp
@@ -4,6 +4,7 @@
#include
#include
+#include
#include
namespace vec_op {
@@ -174,8 +175,9 @@ struct FP32Vec8 : public Vec {
}
explicit FP32Vec8(const BF16Vec8& v) {
- reg.val[0] = (__vector float)vec_mergeh(zero, v.reg);
- reg.val[1] = (__vector float)vec_mergel(zero, v.reg);
+ // On big-endian s390x, place BF16 first to get correct byte order
+ reg.val[0] = (__vector float)vec_mergeh(v.reg, zero);
+ reg.val[1] = (__vector float)vec_mergel(v.reg, zero);
}
float reduce_sum() const {
@@ -189,51 +191,257 @@ struct FP32Vec8 : public Vec {
}
FP32Vec8 exp() const {
- // TODO: Vectorize this
- AliasReg ar;
- ar.reg = reg;
- f32x4x4_t ret;
- ret.val[0][0] = std::exp(ar.values[0]);
- ret.val[0][1] = std::exp(ar.values[1]);
- ret.val[0][2] = std::exp(ar.values[2]);
- ret.val[0][3] = std::exp(ar.values[3]);
- ret.val[1][0] = std::exp(ar.values[4]);
- ret.val[1][1] = std::exp(ar.values[5]);
- ret.val[1][2] = std::exp(ar.values[6]);
- ret.val[1][3] = std::exp(ar.values[7]);
- return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
+ f32x4x2_t out;
+
+ const __vector float log2e = vec_splats(1.44269504088896341f);
+ const __vector float one = vec_splats(1.0f);
+ const __vector float min_x = vec_splats(-87.3f);
+ const __vector float max_x = vec_splats(88.7f);
+
+ // 5th-degree minimax polynomial for 2^r (r in [0,1))
+ const __vector float c1 = vec_splats(0.6931471805599453f);
+ const __vector float c2 = vec_splats(0.240226506959101f);
+ const __vector float c3 = vec_splats(0.05550410866482158f);
+ const __vector float c4 = vec_splats(0.009618129107628477f);
+ const __vector float c5 = vec_splats(0.0013333558146428443f);
+
+ for (int i = 0; i < 2; i++) {
+ __vector float x = reg.val[i];
+
+ x = vec_max(x, min_x);
+ x = vec_min(x, max_x);
+
+ __vector float y = vec_mul(x, log2e);
+
+ __vector float kf = vec_floor(y);
+ __vector float r = vec_sub(y, kf);
+
+ __vector signed int k = vec_signed(kf);
+ const __vector signed int min_k = vec_splats((signed int)-126);
+ const __vector signed int max_k = vec_splats((signed int)127);
+ k = vec_min(vec_max(k, min_k), max_k);
+
+ // Build 2^k from exponent bits
+ __vector signed int exp_int = vec_add(k, vec_splats((signed int)127));
+ __vector unsigned int bits = (__vector unsigned int)exp_int;
+ bits = vec_sl(bits, vec_splats((unsigned int)23));
+ __vector float pow2k = (__vector float)bits;
+
+ // Improved minimax polynomial
+ __vector float poly = vec_madd(c5, r, c4);
+ poly = vec_madd(poly, r, c3);
+ poly = vec_madd(poly, r, c2);
+ poly = vec_madd(poly, r, c1);
+ poly = vec_madd(poly, r, one);
+
+ out.val[i] = vec_mul(pow2k, poly);
+ }
+
+ return FP32Vec8(out);
}
FP32Vec8 tanh() const {
- // TODO: Vectorize this
- AliasReg ar;
- ar.reg = reg;
- f32x4x4_t ret;
- ret.val[0][0] = std::tanh(ar.values[0]);
- ret.val[0][1] = std::tanh(ar.values[1]);
- ret.val[0][2] = std::tanh(ar.values[2]);
- ret.val[0][3] = std::tanh(ar.values[3]);
- ret.val[1][0] = std::tanh(ar.values[4]);
- ret.val[1][1] = std::tanh(ar.values[5]);
- ret.val[1][2] = std::tanh(ar.values[6]);
- ret.val[1][3] = std::tanh(ar.values[7]);
- return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
+ // tanh(x) = (exp(2x) - 1) / (exp(2x) + 1)
+ const __vector float one = vec_splats(1.0f);
+ const __vector float two = vec_splats(2.0f);
+ const __vector float zero = vec_splats(0.0f);
+ const __vector float sat =
+ vec_splats(9.0f); // beyond this, tanh(x) ~ sign(x)
+
+ f32x4x2_t out;
+
+ for (int i = 0; i < 2; i++) {
+ __vector float x = reg.val[i];
+ __vector float ax = vec_abs(x);
+
+ // sign(x): +1 or -1
+ __vector float sign = vec_sel(vec_splats(-1.0f), one, vec_cmpgt(x, zero));
+
+ // saturation mask: |x| > sat
+ __vector __bool int saturated = vec_cmpgt(ax, sat);
+
+ // 2x
+ __vector float two_x = vec_mul(x, two);
+
+ // Build a temporary FP32Vec8 with both lanes = 2x, reuse exp()
+ f32x4x2_t tmp;
+ tmp.val[0] = two_x;
+ tmp.val[1] = two_x;
+ FP32Vec8 exp_2x_vec(tmp);
+
+ FP32Vec8 e2x = exp_2x_vec.exp();
+ __vector float e = e2x.reg.val[i];
+
+ // tanh(x) = (e - 1) / (e + 1)
+ __vector float num = vec_sub(e, one);
+ __vector float den = vec_add(e, one);
+
+ __vector float t = vec_div(num, den);
+
+ // For large |x|, clamp to sign(x)
+ out.val[i] = vec_sel(t, sign, saturated);
+ }
+
+ return FP32Vec8(out);
}
FP32Vec8 er() const {
- // TODO: Vectorize this
- AliasReg ar;
- ar.reg = reg;
- f32x4x4_t ret;
- ret.val[0][0] = std::erf(ar.values[0]);
- ret.val[0][1] = std::erf(ar.values[1]);
- ret.val[0][2] = std::erf(ar.values[2]);
- ret.val[0][3] = std::erf(ar.values[3]);
- ret.val[1][0] = std::erf(ar.values[4]);
- ret.val[1][1] = std::erf(ar.values[5]);
- ret.val[1][2] = std::erf(ar.values[6]);
- ret.val[1][3] = std::erf(ar.values[7]);
- return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
+ // A&S 7.1.26 approximation:
+ // erf(x) = sign(x) * (1 - ((((a5*t + a4)*t + a3)*t + a2)*t + a1) * t *
+ // exp(-x^2)) t = 1 / (1 + p*|x|), p = 0.3275911
+
+ const __vector float one = vec_splats(1.0f);
+ const __vector float zero = vec_splats(0.0f);
+ const __vector float p = vec_splats(0.3275911f);
+
+ // Polynomial coeffs
+ const __vector float a1 = vec_splats(0.254829592f);
+ const __vector float a2 = vec_splats(-0.284496736f);
+ const __vector float a3 = vec_splats(1.421413741f);
+ const __vector float a4 = vec_splats(-1.453152027f);
+ const __vector float a5 = vec_splats(1.061405429f);
+
+ // Threshold where erf(x) ~ sign(x)
+ const __vector float sat = vec_splats(6.0f);
+
+ f32x4x2_t out;
+
+ for (int lane = 0; lane < 2; lane++) {
+ __vector float x = reg.val[lane];
+ __vector float ax = vec_abs(x);
+
+ // sign(x)
+ __vector float sign = vec_sel(vec_splats(-1.0f), one, vec_cmpgt(x, zero));
+
+ // |x| > 6 → erf(x) = ±1
+ __vector __bool int saturated = vec_cmpgt(ax, sat);
+
+ // t = 1 / (1 + p * |x|)
+ __vector float t = vec_madd(p, ax, one);
+ t = vec_div(one, t);
+
+ // poly = a5
+ __vector float poly = a5;
+ poly = vec_madd(poly, t, a4);
+ poly = vec_madd(poly, t, a3);
+ poly = vec_madd(poly, t, a2);
+ poly = vec_madd(poly, t, a1);
+
+ // full polynomial: poly = poly * t
+ poly = vec_mul(poly, t);
+
+ // Compute exp(-x^2)
+ __vector float x2 = vec_mul(x, x);
+ __vector float neg_x2 = vec_neg(x2);
+
+ f32x4x2_t tmp;
+ tmp.val[0] = neg_x2;
+ tmp.val[1] = neg_x2;
+ FP32Vec8 exp_neg_x2(tmp);
+
+ FP32Vec8 e = exp_neg_x2.exp();
+ __vector float ex = e.reg.val[lane];
+
+ // erf(x) = sign * (1 - poly * exp(-x^2))
+ __vector float term = vec_mul(poly, ex);
+ __vector float y = vec_sub(one, term);
+ y = vec_mul(y, sign);
+
+ // saturated → ±1
+ __vector float sat_val = vec_mul(sign, one);
+ out.val[lane] = vec_sel(y, sat_val, saturated);
+ }
+
+ return FP32Vec8(out);
+ }
+ // Elementwise sigmoid(x) = 1 / (1 + exp(-x))
+ FP32Vec8 sigmoid() const {
+ const __vector float one = vec_splats(1.0f);
+
+ f32x4x2_t neg;
+ for (int i = 0; i < 2; ++i) {
+ neg.val[i] = vec_neg(reg.val[i]);
+ }
+
+ FP32Vec8 neg_x(neg);
+ FP32Vec8 e = neg_x.exp(); // exp(-x)
+
+ f32x4x2_t denom;
+ for (int i = 0; i < 2; ++i) {
+ denom.val[i] = vec_add(one, e.reg.val[i]);
+ }
+
+ FP32Vec8 denom_vec(denom);
+ FP32Vec8 one_vec(1.0f);
+
+ return one_vec / denom_vec;
+ }
+
+ // Tanh-based GELU:
+ // gelu(x) = 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x^3)))
+ FP32Vec8 gelu_tanh() const {
+ const __vector float k_s2pi = vec_splats(0.7978845608028654f); // √(2/π)
+ const __vector float k_0_0447 = vec_splats(0.044715f);
+
+ f32x4x2_t x2, x3, inner;
+ for (int i = 0; i < 2; ++i) {
+ __vector float x = reg.val[i];
+ x2.val[i] = vec_mul(x, x); // x^2
+ x3.val[i] = vec_mul(x2.val[i], x); // x^3
+ __vector float t = vec_madd(k_0_0447, x3.val[i], x); // x + 0.044715*x^3
+ inner.val[i] = vec_mul(k_s2pi, t); // √(2/π)*(...)
+ }
+
+ FP32Vec8 inner_vec(inner);
+ FP32Vec8 t = inner_vec.tanh(); // tanh part
+
+ FP32Vec8 one_vec(1.0f);
+ FP32Vec8 half_vec(0.5f);
+
+ FP32Vec8 x_vec(*this);
+ return x_vec * half_vec * (one_vec + t);
+ }
+
+ // Erf-based GELU:
+ // gelu(x) = 0.5 * x * (1 + erf(x / √2))
+ FP32Vec8 gelu_erf() const {
+ const __vector float inv_sqrt2 = vec_splats(0.7071067811865476f); // 1/√2
+ FP32Vec8 x_vec(*this);
+
+ f32x4x2_t scaled;
+ for (int i = 0; i < 2; ++i) {
+ scaled.val[i] = vec_mul(reg.val[i], inv_sqrt2);
+ }
+ FP32Vec8 x_scaled(scaled);
+
+ FP32Vec8 erf_x = x_scaled.er();
+
+ FP32Vec8 one_vec(1.0f);
+ FP32Vec8 half_vec(0.5f);
+
+ return x_vec * half_vec * (one_vec + erf_x);
+ }
+
+ // Elementwise reciprocal: 1/x (scalar per lane, for correctness)
+ FP32Vec8 rcp() const {
+ AliasReg in, out;
+ in.reg = reg;
+
+ for (int i = 0; i < VEC_ELEM_NUM; ++i) {
+ out.values[i] = 1.0f / in.values[i];
+ }
+ return FP32Vec8(out.reg);
+ }
+
+ // Elementwise rsqrt(x) = 1 / sqrt(x) (scalar per lane, for correctness)
+ FP32Vec8 rsqrt() const {
+ AliasReg in, out;
+ in.reg = reg;
+
+ for (int i = 0; i < VEC_ELEM_NUM; ++i) {
+ out.values[i] = 1.0f / std::sqrt(in.values[i]);
+ }
+ return FP32Vec8(out.reg);
}
FP32Vec8 operator*(const FP32Vec8& b) const {
@@ -316,10 +524,11 @@ struct FP32Vec16 : public Vec {
}
explicit FP32Vec16(const BF16Vec16& v) {
- reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]);
- reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]);
- reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]);
- reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]);
+ // On big-endian s390x, place BF16 first to get correct byte order
+ reg.val[0] = (__vector float)vec_mergeh(v.reg.val[0], zero);
+ reg.val[1] = (__vector float)vec_mergel(v.reg.val[0], zero);
+ reg.val[2] = (__vector float)vec_mergeh(v.reg.val[1], zero);
+ reg.val[3] = (__vector float)vec_mergel(v.reg.val[1], zero);
}
explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
@@ -376,6 +585,23 @@ struct FP32Vec16 : public Vec {
return result;
}
+ FP32Vec16 max(const FP32Vec16& b) const {
+ return FP32Vec16(f32x4x4_t({vec_max(reg.val[0], b.reg.val[0]),
+ vec_max(reg.val[1], b.reg.val[1]),
+ vec_max(reg.val[2], b.reg.val[2]),
+ vec_max(reg.val[3], b.reg.val[3])}));
+ }
+
+ float reduce_max() const {
+ AliasReg ar;
+ ar.reg = reg;
+ float result = ar.values[0];
+ unroll_loop([&result, &ar](int i) {
+ if (ar.values[i] > result) result = ar.values[i];
+ });
+ return result;
+ }
+
void save(float* ptr) const {
vec_xst(reg.val[0], 0, ptr);
vec_xst(reg.val[1], 16, ptr);
@@ -402,15 +628,14 @@ struct VecType {
using vec_type = BF16Vec8;
};
+// On s390x, FP16 (Half) is not natively supported, use FP32 vectors instead
+using FP16Vec16 = FP32Vec16;
+
template
void storeFP32(float v, T* ptr) {
*ptr = v;
}
-inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
- acc = acc + a * b;
-}
-
namespace c10 {
struct BFloat16 {
uint16_t value; // Assume BFloat16 is defined as a struct containing a 16-bit
@@ -429,6 +654,79 @@ inline void storeFP32(float v, c10::BFloat16* ptr) {
#define __VEC_CLASS_FP_NAN (1 << 6)
#endif
+// Optimized FMA (Fused Multiply-Add) implementations using IBM Z vector
+// intrinsics
+
+// FP32Vec4 FMA: acc = acc + (a * b) or equivalently acc = fma(a, b, acc)
+FORCE_INLINE void fma(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
+ acc.reg = vec_madd(a.reg, b.reg, acc.reg);
+}
+
+// FP32Vec8 FMA: acc = acc + (a * b)
+FORCE_INLINE void fma(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
+ acc.reg.val[0] = vec_madd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_madd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+}
+
+// FP32Vec16 FMA: acc = acc + (a * b)
+FORCE_INLINE void fma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
+ acc.reg.val[0] = vec_madd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_madd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+ acc.reg.val[2] = vec_madd(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
+ acc.reg.val[3] = vec_madd(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
+}
+
+// Multiply-Subtract: acc = acc - (a * b)
+FORCE_INLINE void fms(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
+ acc.reg = vec_msub(a.reg, b.reg, acc.reg);
+}
+
+FORCE_INLINE void fms(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
+ acc.reg.val[0] = vec_msub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_msub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+}
+
+FORCE_INLINE void fms(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
+ acc.reg.val[0] = vec_msub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_msub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+ acc.reg.val[2] = vec_msub(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
+ acc.reg.val[3] = vec_msub(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
+}
+
+// Negative Multiply-Add: acc = -(a * b) + acc
+FORCE_INLINE void nfma(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
+ acc.reg = vec_nmadd(a.reg, b.reg, acc.reg);
+}
+
+FORCE_INLINE void nfma(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
+ acc.reg.val[0] = vec_nmadd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_nmadd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+}
+
+FORCE_INLINE void nfma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
+ acc.reg.val[0] = vec_nmadd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_nmadd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+ acc.reg.val[2] = vec_nmadd(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
+ acc.reg.val[3] = vec_nmadd(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
+}
+
+// Negative Multiply-Subtract: acc = -(a * b) - acc
+FORCE_INLINE void nfms(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
+ acc.reg = vec_nmsub(a.reg, b.reg, acc.reg);
+}
+
+FORCE_INLINE void nfms(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
+ acc.reg.val[0] = vec_nmsub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_nmsub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+}
+
+FORCE_INLINE void nfms(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
+ acc.reg.val[0] = vec_nmsub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_nmsub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+ acc.reg.val[2] = vec_nmsub(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
+ acc.reg.val[3] = vec_nmsub(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
+}
+
const static __vector unsigned char omask = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff,
@@ -441,13 +739,24 @@ const static __vector unsigned int one = {1, 1, 1, 1};
inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
__vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
__vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
+ __vector unsigned int lsb0 = inp0 >> sh16;
+ __vector unsigned int lsb1 = inp1 >> sh16;
+ lsb0 = lsb0 & one;
+ lsb1 = lsb1 & one;
+ __vector unsigned int rnd0 = lsb0 + bias;
+ __vector unsigned int rnd1 = lsb1 + bias;
+ inp0 = inp0 + rnd0;
+ inp1 = inp1 + rnd1;
int cc;
__vector __bool int sel0 =
vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc);
__vector __bool int sel1 =
vec_fp_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN, &cc);
- inp0 = vec_sel(inp0, nan, sel0) >> sh16;
- inp1 = vec_sel(inp1, nan, sel1) >> sh16;
+ inp0 = vec_sel(inp0, nan, sel0);
+ inp1 = vec_sel(inp1, nan, sel1);
+ inp0 = inp0 >> sh16;
+ inp1 = inp1 >> sh16;
+
reg = (__vector signed short)vec_perm(inp0, inp1, omask);
}
@@ -456,6 +765,22 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
__vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
__vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]);
__vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]);
+ __vector unsigned int lsb0 = inp0 >> sh16;
+ __vector unsigned int lsb1 = inp1 >> sh16;
+ __vector unsigned int lsb2 = inp2 >> sh16;
+ __vector unsigned int lsb3 = inp3 >> sh16;
+ lsb0 = lsb0 & one;
+ lsb1 = lsb1 & one;
+ lsb2 = lsb2 & one;
+ lsb3 = lsb3 & one;
+ __vector unsigned int rnd0 = lsb0 + bias;
+ __vector unsigned int rnd1 = lsb1 + bias;
+ __vector unsigned int rnd2 = lsb2 + bias;
+ __vector unsigned int rnd3 = lsb3 + bias;
+ inp0 = inp0 + rnd0;
+ inp1 = inp1 + rnd1;
+ inp2 = inp2 + rnd2;
+ inp3 = inp3 + rnd3;
int cc;
__vector __bool int sel0 =
vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc);
@@ -465,15 +790,164 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
vec_fp_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN, &cc);
__vector __bool int sel3 =
vec_fp_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN, &cc);
- inp0 = vec_sel(inp0, nan, sel0) >> sh16;
- inp1 = vec_sel(inp1, nan, sel1) >> sh16;
- inp2 = vec_sel(inp2, nan, sel2) >> sh16;
- inp3 = vec_sel(inp3, nan, sel3) >> sh16;
+ inp0 = vec_sel(inp0, nan, sel0);
+ inp1 = vec_sel(inp1, nan, sel1);
+ inp2 = vec_sel(inp2, nan, sel2);
+ inp3 = vec_sel(inp3, nan, sel3);
+ inp0 = inp0 >> sh16;
+ inp1 = inp1 >> sh16;
+ inp2 = inp2 >> sh16;
+ inp3 = inp3 >> sh16;
+
reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask);
reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask);
}
-inline void prefetch(const void* addr) { void __dcbt(const void* addr); }
+// 1D softmax over `n` elements in `input`, writes result to `output`.
+// Uses FP32Vec8 for the vectorized main body with a scalar tail.
+// Three passes for numerical stability: (1) max, (2) exp(x - max) + sum,
+// (3) normalize by 1 / sum.  n <= 0 is a no-op.
+FORCE_INLINE void softmax_fp32vec8(float* output, const float* input, int n) {
+  if (n <= 0) return;
+
+  // ---------- Pass 1: find max ----------
+  // Fixed template argument: `std::numeric_limits::infinity()` (missing
+  // `<float>`) does not compile.
+  float max_val = -std::numeric_limits<float>::infinity();
+  int i = 0;
+
+  for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+    FP32Vec8 v(input + i);
+    FP32Vec8::AliasReg ar;
+    ar.reg = v.reg;
+    for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+      if (ar.values[j] > max_val) max_val = ar.values[j];
+    }
+  }
+  for (; i < n; ++i) {
+    if (input[i] > max_val) max_val = input[i];
+  }
+
+  // ---------- Pass 2: compute exp(x - max) and sum ----------
+  float sum = 0.0f;
+  i = 0;
+
+  for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+    float tmp[FP32Vec8::VEC_ELEM_NUM];
+    for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+      tmp[j] = input[i + j] - max_val;
+    }
+
+    FP32Vec8 v(tmp);
+    FP32Vec8 e = v.exp();
+
+    FP32Vec8::AliasReg ar;
+    ar.reg = e.reg;
+    for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+      output[i + j] = ar.values[j];
+      sum += ar.values[j];
+    }
+  }
+
+  // Tail (scalar std::exp may differ from the vector exp in the last ulp).
+  for (; i < n; ++i) {
+    float x = input[i] - max_val;
+    float ex = std::exp(x);  // scalar tail
+    output[i] = ex;
+    sum += ex;
+  }
+
+  // ---------- Pass 3: normalize ----------
+  float inv_sum = 1.0f / sum;
+  i = 0;
+
+  for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+    float tmp[FP32Vec8::VEC_ELEM_NUM];
+    for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+      tmp[j] = output[i + j] * inv_sum;
+    }
+    FP32Vec8 v(tmp);
+    v.save(output + i);
+  }
+
+  for (; i < n; ++i) {
+    output[i] *= inv_sum;
+  }
+}
+
+// 1D RMSNorm kernel:
+//   input:  x[0..n-1]
+//   weight: w[0..n-1] (gamma), may be nullptr
+//   output: y[i] = x[i] * inv_rms * (weight[i] if weight != nullptr else 1)
+//   eps:    small epsilon for numerical stability
+// n <= 0 is a no-op.
+FORCE_INLINE void rmsnorm_fp32vec8(float* output, const float* input,
+                                   const float* weight, int n, float eps) {
+  if (n <= 0) return;
+
+  // ---------- Pass 1: compute sum of squares ----------
+  float sum_sq = 0.0f;
+  int i = 0;
+
+  for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+    FP32Vec8 x_vec(input + i);
+    FP32Vec8 sq = x_vec * x_vec;
+    FP32Vec8::AliasReg ar;
+    ar.reg = sq.reg;
+    for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+      sum_sq += ar.values[j];
+    }
+  }
+
+  // Tail
+  for (; i < n; ++i) {
+    float v = input[i];
+    sum_sq += v * v;
+  }
+
+  // Fixed template argument: `static_cast(n)` (missing `<float>`) does
+  // not compile.
+  float mean_sq = sum_sq / static_cast<float>(n);
+  float inv_rms = 1.0f / std::sqrt(mean_sq + eps);
+
+  // ---------- Pass 2: scale (and apply weight if given) ----------
+  const float inv_rms_f = inv_rms;
+  // Loop invariant hoisted out of both loops: the broadcast 1/rms vector.
+  FP32Vec8 scale_vec(inv_rms_f);
+  i = 0;
+
+  if (weight) {
+    // with gamma
+    for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+      FP32Vec8 x_vec(input + i);
+      // Load gamma directly; no scalar staging copy needed.
+      FP32Vec8 w_vec(weight + i);
+      FP32Vec8 y = x_vec * scale_vec * w_vec;
+      y.save(output + i);
+    }
+
+    for (; i < n; ++i) {
+      output[i] = input[i] * inv_rms_f * weight[i];
+    }
+  } else {
+    // without gamma
+    for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+      FP32Vec8 x_vec(input + i);
+      FP32Vec8 y = x_vec * scale_vec;
+      y.save(output + i);
+    }
+
+    for (; i < n; ++i) {
+      output[i] = input[i] * inv_rms_f;
+    }
+  }
+}
+
+// Prefetch data to cache for better memory access performance
+// (replaces the previous no-op stub that only declared __dcbt).
+FORCE_INLINE void prefetch(const void* addr) {
+  __builtin_prefetch(addr, 0, 3);  // 0=read, 3=high temporal locality
+}
}; // namespace vec_op
From 2601f18a8216b8e0b8743cded669b8b4d0e3e980 Mon Sep 17 00:00:00 2001
From: WeiQing Chen <40507679+david6666666@users.noreply.github.com>
Date: Mon, 24 Nov 2025 22:08:29 +0800
Subject: [PATCH 007/134] [EPLB] Optimize EPLB for Async Rearrange Experts
(#22179)
Signed-off-by: David Chen <530634352@qq.com>
Co-authored-by: SunChenxiang123 <1291824390@qq.com>
---
tests/distributed/test_eplb_execute.py | 127 +++++-
tests/distributed/test_eplb_spec_decode.py | 39 +-
vllm/config/parallel.py | 4 +
vllm/distributed/eplb/async_worker.py | 115 ++++++
vllm/distributed/eplb/eplb_state.py | 433 ++++++++++++++++++---
vllm/distributed/eplb/rebalance_execute.py | 137 ++++++-
vllm/v1/worker/gpu_model_runner.py | 2 +
7 files changed, 779 insertions(+), 78 deletions(-)
create mode 100644 vllm/distributed/eplb/async_worker.py
diff --git a/tests/distributed/test_eplb_execute.py b/tests/distributed/test_eplb_execute.py
index 9498e75b279b7..781dfd44c1ef6 100644
--- a/tests/distributed/test_eplb_execute.py
+++ b/tests/distributed/test_eplb_execute.py
@@ -1,13 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import asyncio
import random
import pytest
import torch
import torch.distributed
-from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace
+from vllm.distributed.eplb.rebalance_execute import (
+ move_from_buffer,
+ rearrange_expert_weights_inplace,
+ transfer_layer,
+)
from vllm.distributed.parallel_state import (
ensure_model_parallel_initialized,
get_tp_group,
@@ -231,6 +236,100 @@ def verify_redundant_experts_have_same_weights(
)
+def _test_async_transfer_layer_without_mtp_worker(
+    env,
+    world_size: int,
+    num_layers: int,
+    num_local_experts: int,
+    num_logical_experts: int,
+) -> None:
+    """Per-rank body for `test_async_transfer_layer_without_mtp`.
+
+    Stages each layer's expert weights with `transfer_layer` on a dedicated
+    CUDA stream, consumes the buffer with `move_from_buffer`, and verifies
+    the final weights match the new expert placement.
+    """
+    set_env_vars_and_device(env)
+    ensure_model_parallel_initialized(
+        tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
+    )
+
+    tp_group = get_tp_group()
+    ep_group = tp_group.device_group
+    ep_rank = torch.distributed.get_rank()
+    device = torch.device(f"cuda:{ep_rank}")
+
+    total_physical_experts = world_size * num_local_experts
+    hidden_sizes = [16, 32]
+
+    # Two independent random redundancy layouts: the "old" placement the
+    # weights start in, and the "new" placement to transfer to.
+    redundancy_config = create_redundancy_config(
+        num_logical_experts,
+        total_physical_experts,
+    )
+    old_indices = create_expert_indices_with_redundancy(
+        num_layers,
+        num_logical_experts,
+        total_physical_experts,
+        redundancy_config,
+    )
+
+    new_redundancy_config = create_redundancy_config(
+        num_logical_experts,
+        total_physical_experts,
+    )
+    new_indices = create_expert_indices_with_redundancy(
+        num_layers,
+        num_logical_experts,
+        total_physical_experts,
+        new_redundancy_config,
+    )
+
+    expert_weights = create_expert_weights(
+        num_layers,
+        num_local_experts,
+        hidden_sizes,
+        ep_rank,
+        device,
+        old_indices,
+    )
+
+    # One staging buffer shared across layers, sized like a single layer.
+    expert_buffer = [torch.empty_like(w) for w in expert_weights[0]]
+    cuda_stream = torch.cuda.Stream(device=device)
+
+    # Stage each layer on the side stream, synchronize, then consume.
+    for layer_idx in range(num_layers):
+        is_unchanged, is_received_locally, experts_recv_loc = asyncio.run(
+            transfer_layer(
+                old_global_expert_indices=old_indices,
+                new_global_expert_indices=new_indices,
+                expert_weights=expert_weights,
+                expert_weights_buffer=expert_buffer,
+                ep_group=ep_group,
+                layer=layer_idx,
+                cuda_stream=cuda_stream,
+            )
+        )
+
+        cuda_stream.synchronize()
+        move_from_buffer(
+            expert_weights=expert_weights[layer_idx],
+            expert_weights_buffer=expert_buffer,
+            is_unchanged=is_unchanged,
+            is_received_locally=is_received_locally,
+            experts_recv_loc=experts_recv_loc,
+            new_indices=new_indices[layer_idx].tolist(),
+            ep_group=ep_group,
+        )
+
+    verify_expert_weights_after_shuffle(
+        expert_weights,
+        new_indices,
+        hidden_sizes,
+        ep_rank,
+        num_local_experts,
+    )
+    verify_redundant_experts_have_same_weights(
+        expert_weights,
+        new_indices,
+        hidden_sizes,
+        world_size,
+        num_local_experts,
+    )
+
+
def _test_rearrange_expert_weights_with_redundancy(
env, world_size, num_layers, num_local_experts, num_logical_experts
) -> None:
@@ -399,6 +498,32 @@ def _test_rearrange_expert_weights_no_change(env, world_size) -> None:
)
+@pytest.mark.parametrize(
+    "world_size,num_layers,num_local_experts,num_logical_experts",
+    [
+        (2, 2, 2, 3),
+    ],
+)
+def test_async_transfer_layer_without_mtp(
+    world_size: int,
+    num_layers: int,
+    num_local_experts: int,
+    num_logical_experts: int,
+):
+    """Exercise async EPLB transfer path without MTP/spec decode."""
+
+    # Needs one GPU per EP rank.
+    if torch.cuda.device_count() < world_size:
+        pytest.skip(f"Need at least {world_size} GPUs to run the test")
+
+    distributed_run(
+        _test_async_transfer_layer_without_mtp_worker,
+        world_size,
+        num_layers,
+        num_local_experts,
+        num_logical_experts,
+    )
+
+
@pytest.mark.parametrize("world_size", [2, 4])
def test_rearrange_expert_weights_no_change(world_size):
"""
diff --git a/tests/distributed/test_eplb_spec_decode.py b/tests/distributed/test_eplb_spec_decode.py
index 11e23f128f331..c055b7a3f6dd7 100644
--- a/tests/distributed/test_eplb_spec_decode.py
+++ b/tests/distributed/test_eplb_spec_decode.py
@@ -10,10 +10,11 @@ from tests.utils import large_gpu_mark
def get_model_args(
model_name: str,
- spec_model_name: str,
+ spec_model_name: str | None,
spec_method: str,
tp_size: int,
model_max_len: int,
+ use_async: bool = False,
) -> dict:
speculative_config = {
"method": spec_method,
@@ -37,6 +38,8 @@ def get_model_args(
"enable_eplb": True,
"max_model_len": model_max_len,
}
+ if use_async:
+ model_args["eplb_config"] = {"use_async": True}
return model_args
@@ -94,3 +97,37 @@ def test_eplb_spec_decode(
measured_value - RTOL < expected_gsm8k_value
and measured_value + RTOL > expected_gsm8k_value
), f"Expected: {expected_gsm8k_value} | Measured: {measured_value}"
+
+
+@large_gpu_mark(min_gb=80)
+def test_eplb_spec_decode_qwen3_next_mtp_async() -> None:
+    """
+    Ensure async EPLB works with MTP speculative decoding for Qwen3-Next.
+    """
+
+    # Reference GSM8K score for this configuration and the tolerance band.
+    TASK = "gsm8k"
+    FILTER = "exact_match,strict-match"
+    RTOL = 0.03
+    expected_gsm8k_value = 0.86
+
+    # NOTE(review): spec_model_name=None — presumably MTP uses the target
+    # model's built-in head; confirm against get_model_args.
+    model_args = get_model_args(
+        model_name="Qwen/Qwen3-Next-80B-A3B-Instruct",
+        spec_model_name=None,
+        spec_method="mtp",
+        tp_size=4,
+        model_max_len=4096,
+        use_async=True,
+    )
+
+    results = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=model_args,
+        tasks=TASK,
+        batch_size=64,
+        num_fewshot=8,
+    )
+    measured_value = results["results"][TASK][FILTER]
+    assert (
+        measured_value - RTOL < expected_gsm8k_value
+        and measured_value + RTOL > expected_gsm8k_value
+    ), f"Expected: {expected_gsm8k_value} | Measured: {measured_value}"
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 4b0236d8de3f5..ad438a8b464e0 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -60,6 +60,10 @@ class EPLBConfig:
Log the balancedness each step of expert parallelism.
This is turned off by default since it will cause communication overhead.
"""
+ use_async: bool = False
+ """
+ Whether to use non-blocking EPLB.
+ """
@config
diff --git a/vllm/distributed/eplb/async_worker.py b/vllm/distributed/eplb/async_worker.py
new file mode 100644
index 0000000000000..e4b4fc92eeaaa
--- /dev/null
+++ b/vllm/distributed/eplb/async_worker.py
@@ -0,0 +1,115 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+The async worker that transfers experts in the background.
+"""
+
+import asyncio
+import threading
+from typing import TYPE_CHECKING
+
+import torch
+from torch.distributed import ProcessGroup
+
+from vllm.distributed.parallel_state import get_ep_group
+from vllm.logger import init_logger
+
+from .rebalance_execute import transfer_layer
+
+if TYPE_CHECKING:
+ from .eplb_state import EplbState
+
+logger = init_logger(__name__)
+
+
+def start_async_worker(
+    state: "EplbState",
+    rank_mapping: dict[int, int] | None = None,
+    is_profile: bool = False,
+) -> threading.Thread:
+    """Spawn the daemon thread that runs the async EPLB transfer loop.
+
+    The thread pins itself to the state's CUDA device, creates a dedicated
+    CUDA stream and a private asyncio event loop, then runs
+    `transfer_run_periodically` for the life of the process (daemon=True).
+    Returns the started thread.
+    """
+    ep_group = get_ep_group().device_group
+    rank = ep_group.rank()
+    device_index = state.cuda_device_index
+
+    def thread_target() -> None:
+        # The worker needs a concrete CUDA device for its private stream.
+        assert device_index is not None
+        torch.cuda.set_device(device_index)
+        cuda_stream = torch.cuda.Stream(device=device_index)
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        try:
+            loop.run_until_complete(
+                transfer_run_periodically(
+                    state=state,
+                    ep_group=ep_group,
+                    is_profile=is_profile,
+                    rank_mapping=rank_mapping,
+                    cuda_stream=cuda_stream,
+                )
+            )
+        except Exception as exc:  # pragma: no cover - diagnostic path
+            logger.exception("async loop error (Rank %d): %s", rank, str(exc))
+        finally:
+            loop.close()
+
+    thread = threading.Thread(target=thread_target, daemon=True)
+    thread.start()
+    return thread
+
+
+async def transfer_run_periodically(
+    state: "EplbState",
+    ep_group: ProcessGroup,
+    is_profile: bool = False,
+    rank_mapping: dict[int, int] | None = None,
+    cuda_stream: torch.cuda.Stream | None = None,
+) -> None:
+    """Background loop: wait for a rearrange signal, then stream layers.
+
+    Wakes on `state.rearrange_event`; for each async-enabled model, stages
+    expert weights layer by layer into the buffer on `cuda_stream` and
+    publishes completion via `buffer_ready_event` / `ep_buffer_ready`.
+    The main thread consumes each filled buffer and advances
+    `layer_to_transfer`.
+    """
+    while True:
+        await asyncio.to_thread(state.rearrange_event.wait)
+        logger.info("async worker woke up for EPLB transfer")
+
+        for model_state in state.model_states.values():
+            if not model_state.is_async_enabled:
+                continue
+            current_num_layers = model_state.model.num_moe_layers
+            while (
+                model_state.rebalanced
+                and model_state.layer_to_transfer < current_num_layers
+            ):
+                if (
+                    not model_state.ep_buffer_ready
+                    and model_state.rebalanced
+                    and model_state.new_physical_to_logical_map is not None
+                ):
+                    # Serialize buffer access with the consumer thread.
+                    await asyncio.to_thread(model_state.buffer_lock.acquire)
+                    try:
+                        # Re-check under the lock: the consumer may have
+                        # advanced past the last layer while we waited.
+                        if model_state.layer_to_transfer >= current_num_layers:
+                            break
+
+                        (
+                            model_state.is_unchanged,
+                            model_state.is_received_locally,
+                            model_state.experts_recv_loc,
+                        ) = await transfer_layer(
+                            old_global_expert_indices=model_state.physical_to_logical_map,
+                            new_global_expert_indices=model_state.new_physical_to_logical_map,
+                            expert_weights=model_state.model.expert_weights,
+                            expert_weights_buffer=model_state.expert_buffer,
+                            ep_group=ep_group,
+                            is_profile=is_profile,
+                            layer=model_state.layer_to_transfer,
+                            cuda_stream=cuda_stream,
+                            rank_mapping=rank_mapping,
+                        )
+                        # Publish completion: the consumer waits on this event
+                        # before touching the buffer.
+                        event = torch.cuda.Event(blocking=False)
+                        cuda_stream.record_event(event)
+                        model_state.buffer_ready_event = event
+                        model_state.ep_buffer_ready = 1
+                    finally:
+                        model_state.buffer_lock.release()
+                else:
+                    if not model_state.rebalanced:
+                        break
+                    # Buffer still owned by the consumer; yield briefly.
+                    await asyncio.sleep(0.001)
+
+        state.rearrange_event.clear()
diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py
index 526d3ceac7b8f..9f8798a96a2fc 100644
--- a/vllm/distributed/eplb/eplb_state.py
+++ b/vllm/distributed/eplb/eplb_state.py
@@ -26,6 +26,7 @@ MoE layer. If we have 32 EP ranks, then each GPU will hold 288 / 32 = 9 local
physical experts.
"""
+import threading
import time
from collections.abc import Sequence
from dataclasses import dataclass
@@ -43,8 +44,9 @@ from vllm.distributed.utils import StatelessProcessGroup
from vllm.logger import init_logger
from vllm.model_executor.models.interfaces import MixtureOfExperts
+from .async_worker import start_async_worker
from .rebalance_algo import rebalance_experts
-from .rebalance_execute import rearrange_expert_weights_inplace
+from .rebalance_execute import move_from_buffer, rearrange_expert_weights_inplace
logger = init_logger(__name__)
@@ -132,6 +134,74 @@ class EplbModelState:
"""
model_name: str
model: MixtureOfExperts
+ expert_buffer: list[torch.Tensor]
+ """
+ The buffer to store the expert weights during transfer.
+ """
+ buffer_lock: threading.Lock
+ """
+ The lock to protect the expert buffer.
+ """
+ buffer_ready_event: torch.cuda.Event | None
+ """
+ CUDA event recorded when the async worker finishes filling the buffer.
+ The main thread waits on this before consuming the buffer.
+ """
+ ep_buffer_ready: int
+ """
+ The flag indicates whether the expert buffer is ready for transfer.
+ 0 or 1.
+ """
+ layer_to_transfer: int
+ """
+ The layer index to transfer in async mode.
+ """
+ rebalanced: bool
+ """
+    The flag indicates whether the expert rebalance has been computed.
+ """
+ pending_global_ready_check: bool
+ """
+ Whether the async EPLB needs to poll peers for buffer readiness.
+ """
+ is_unchanged: list[bool]
+ """
+    Intermediate variable between `move_to_buffer` and `move_to_workspace`.
+    The size is the same as the number of physical experts in the current layer.
+ """
+ is_received_locally: list[bool]
+ """
+ intermediate variable between `move_to_buffer` and `move_to_workspace`.
+ The size is same as the num of physical experts in the current layer.
+ """
+ experts_recv_loc: dict[int, int]
+ """
+ intermediate variable between `move_to_buffer` and `move_to_workspace`.
+ The size is same as the num of physical experts in the current layer.
+ """
+ is_async_enabled: bool
+ """
+ The flag indicates whether the EPLB is running in async mode.
+ """
+ cuda_device_index: int | None
+ """
+ CUDA device index for the async EPLB worker thread.
+ """
+ new_physical_to_logical_map: torch.Tensor | None = None
+ """
+ intermediate variable between `move_to_buffer` and `move_to_workspace`.
+ the size is same as physical_to_logical_map
+ """
+ new_logical_to_physical_map: torch.Tensor | None = None
+ """
+ intermediate variable between `move_to_buffer` and `move_to_workspace`.
+ the size is same as logical_to_physical_map
+ """
+ new_logical_replica_count: torch.Tensor | None = None
+ """
+ intermediate variable between `move_to_buffer` and `move_to_workspace`.
+ the size is same as logical_replica_count
+ """
class EplbState:
@@ -164,12 +234,31 @@ class EplbState:
Otherwise, the rearrangement will hang at collective
communication calls.
"""
- self.expert_rearrangement_step: int = 0
+ self.expert_rearrangement_step_interval: int = 0
"""
Interval for expert rearrangement steps.
This is a constant and is taken from the config.
"""
- self.expert_rearrangement_step_interval: int = 0
+ self.is_async: bool = False
+ """
+ The flag indicates whether the EPLB is running in async mode.
+ """
+ self.rearrange_event = threading.Event()
+ """
+ Event to signal when a new rearrangement is needed for the async thread.
+ """
+ self.async_worker: threading.Thread | None = None
+ """
+ Background thread handling async transfers.
+ """
+ self.cuda_device_index: int | None = None
+ """
+ CUDA device index for the async EPLB worker thread.
+ """
+ if self.device.type == "cuda":
+ self.cuda_device_index = self.device.index
+ if self.cuda_device_index is None and torch.cuda.is_available():
+ self.cuda_device_index = torch.cuda.current_device()
@staticmethod
def build_initial_global_physical_to_logical_map(
@@ -239,6 +328,8 @@ class EplbState:
Build the initial EPLB state.
"""
self.validate_ep_configuration(model)
+ self.is_async = self.parallel_config.eplb_config.use_async
+
physical_to_logical_map_list = (
EplbState.build_initial_global_physical_to_logical_map(
model.num_routed_experts,
@@ -368,7 +459,12 @@ class EplbState:
physical_to_logical_map = new_physical_to_logical_map.to(self.device)
logical_to_physical_map.copy_(new_logical_to_physical_map)
logical_replica_count.copy_(new_logical_replica_count)
+ else:
+ new_physical_to_logical_map = None
+ new_logical_to_physical_map = None
+
+ new_logical_replica_count = None
model.set_eplb_state(
expert_load_pass,
logical_to_physical_map,
@@ -385,15 +481,33 @@ class EplbState:
)
self.expert_rearrangement_step = 0
- self.model_states[model_config.compute_hash()] = EplbModelState(
- physical_to_logical_map,
- logical_to_physical_map,
- logical_replica_count,
- expert_load_pass,
- expert_load_window,
- model_config.model,
- model,
+ expert_buffer = [torch.empty_like(w) for w in model.expert_weights[0]]
+
+ model_state = EplbModelState(
+ physical_to_logical_map=physical_to_logical_map,
+ logical_to_physical_map=logical_to_physical_map,
+ logical_replica_count=logical_replica_count,
+ expert_load_pass=expert_load_pass,
+ expert_load_window=expert_load_window,
+ model_name=model_config.model,
+ model=model,
+ expert_buffer=expert_buffer,
+ buffer_lock=threading.Lock(),
+ buffer_ready_event=None,
+ ep_buffer_ready=0,
+ layer_to_transfer=0,
+ rebalanced=False,
+ pending_global_ready_check=False,
+ is_unchanged=[],
+ is_received_locally=[],
+ experts_recv_loc={},
+ is_async_enabled=self.is_async,
+ cuda_device_index=self.cuda_device_index,
+ new_physical_to_logical_map=new_physical_to_logical_map,
+ new_logical_to_physical_map=new_logical_to_physical_map,
+ new_logical_replica_count=new_logical_replica_count,
)
+ self.model_states[model_config.compute_hash()] = model_state
def step(
self,
@@ -420,7 +534,7 @@ class EplbState:
- `max_tokens`: The maximum load across ranks.
- `balancedness`: The ratio of average load to maximum load.
"""
-
+ ep_group = get_ep_group().device_group
if is_profile:
self.rearrange(is_profile=True)
return
@@ -488,7 +602,49 @@ class EplbState:
# rearrangement step and perform rearrangement to ensure all ranks are
# performing collective communication.
self.expert_rearrangement_step += 1
+
+ if self.is_async:
+ for eplb_model_state in self.model_states.values():
+ if not eplb_model_state.is_async_enabled:
+ continue
+
+ all_ranks_buffer_ready = False
+ if eplb_model_state.pending_global_ready_check:
+ all_ranks_buffer_ready = self._all_ranks_buffer_ready(
+ eplb_model_state
+ )
+ if (
+ eplb_model_state.is_async_enabled
+ and eplb_model_state.ep_buffer_ready
+ and all_ranks_buffer_ready
+ ):
+ self.move_to_workspace(
+ model_state=eplb_model_state,
+ ep_group=ep_group,
+ is_profile=is_profile,
+ )
+ if (
+ eplb_model_state.layer_to_transfer
+ >= eplb_model_state.model.num_moe_layers
+ ):
+ self.post_eplb(eplb_model_state, is_profile)
+ eplb_model_state.rebalanced = False
+ eplb_model_state.layer_to_transfer = 0
+ eplb_model_state.pending_global_ready_check = False
+ logger.info(
+ "finish async transfer for model %s rank %d layer %d",
+ eplb_model_state.model_name,
+ ep_group.rank(),
+ eplb_model_state.model.num_moe_layers,
+ )
+
if self.expert_rearrangement_step >= self.expert_rearrangement_step_interval:
+ if any(
+ eplb_model_state.is_async_enabled and eplb_model_state.rebalanced
+ for eplb_model_state in self.model_states.values()
+ ):
+ # Still performing asynchronous rearrangement
+ return
self.expert_rearrangement_step = 0
self.rearrange()
@@ -524,7 +680,11 @@ class EplbState:
if is_main_rank:
torch.cuda.synchronize()
time_start = time.perf_counter()
- logger.info("Rearranging experts %s...", "(profile)" if is_profile else "")
+ logger.info(
+ "Rearranging experts %s %s...",
+ "(async mode)" if self.is_async else "sync mode",
+ "(profile)" if is_profile else "",
+ )
if global_expert_loads is None:
# Map the physical expert load to global logical experts
@@ -593,6 +753,7 @@ class EplbState:
model = eplb_model_state.model
num_replicas = model.num_physical_experts
num_groups = model.num_expert_groups
+
if rank_mapping is not None and len(rank_mapping) == ep_group.size():
# NOTE(yongji): scale down, we need to rebalance the experts on
# remaining GPUs, transfer the experts while we haven't shutdown
@@ -608,7 +769,7 @@ class EplbState:
num_gpus = ep_group.size()
if num_gpus % num_nodes != 0:
- self.num_nodes = 1
+ num_nodes = 1
logger.warning_once(
f"num_gpus % num_nodes != 0, "
"not using hierarchical rearrangement algorithm.\n"
@@ -631,60 +792,216 @@ class EplbState:
num_gpus,
)
- # Update expert weights
- rearrange_expert_weights_inplace(
- eplb_model_state.physical_to_logical_map,
- new_physical_to_logical_map,
- eplb_model_state.model.expert_weights,
- ep_group,
- is_profile,
- rank_mapping,
- )
+ if not eplb_model_state.is_async_enabled or is_profile:
+ # Update expert weights
+ rearrange_expert_weights_inplace(
+ eplb_model_state.physical_to_logical_map,
+ new_physical_to_logical_map,
+ eplb_model_state.model.expert_weights,
+ ep_group,
+ is_profile,
+ rank_mapping,
+ )
- if not is_profile:
- if (
- eplb_model_state.physical_to_logical_map.shape[1]
- != new_physical_to_logical_map.shape[1]
- ):
- eplb_model_state.physical_to_logical_map = (
- new_physical_to_logical_map.to(
- eplb_model_state.physical_to_logical_map.device
+ if not is_profile:
+ if (
+ eplb_model_state.physical_to_logical_map.shape[1]
+ != new_physical_to_logical_map.shape[1]
+ ):
+ eplb_model_state.physical_to_logical_map = (
+ new_physical_to_logical_map.to(
+ eplb_model_state.physical_to_logical_map.device
+ )
)
+ else:
+ eplb_model_state.physical_to_logical_map.copy_(
+ new_physical_to_logical_map
+ )
+ max_physical_slots = new_logical_to_physical_map.shape[-1]
+ assert (
+ max_physical_slots
+ <= eplb_model_state.logical_to_physical_map.shape[-1]
)
- else:
- eplb_model_state.physical_to_logical_map.copy_(
- new_physical_to_logical_map
+ new_logical_to_physical_map = torch.nn.functional.pad(
+ new_logical_to_physical_map,
+ (
+ 0,
+ eplb_model_state.logical_to_physical_map.shape[-1]
+ - max_physical_slots,
+ ),
+ value=-1,
)
- max_physical_slots = new_logical_to_physical_map.shape[-1]
- assert (
- max_physical_slots
- <= eplb_model_state.logical_to_physical_map.shape[-1]
- )
- new_logical_to_physical_map = torch.nn.functional.pad(
+ eplb_model_state.logical_to_physical_map.copy_(
+ new_logical_to_physical_map
+ )
+ eplb_model_state.logical_replica_count.copy_(
+ new_logical_replica_count
+ )
+ if is_main_rank:
+ assert time_start is not None
+ torch.cuda.synchronize()
+ time_end = time.perf_counter()
+ logger.info(
+ "Rearranged experts%sin %.2f seconds.",
+ " (profile) " if is_profile else " ",
+ time_end - time_start,
+ )
+ else:
+ device = eplb_model_state.physical_to_logical_map.device
+ new_physical = new_physical_to_logical_map.to(device)
+ max_slots = eplb_model_state.logical_to_physical_map.shape[-1]
+ padded_logical = torch.nn.functional.pad(
new_logical_to_physical_map,
- (
- 0,
- eplb_model_state.logical_to_physical_map.shape[-1]
- - max_physical_slots,
- ),
+ (0, max(0, max_slots - new_logical_to_physical_map.shape[-1])),
value=-1,
+ ).to(eplb_model_state.logical_to_physical_map.device)
+ new_replica = new_logical_replica_count.to(
+ eplb_model_state.logical_replica_count.device
)
- eplb_model_state.logical_to_physical_map.copy_(
- new_logical_to_physical_map
- )
- eplb_model_state.logical_replica_count.copy_(new_logical_replica_count)
- if is_main_rank:
- assert time_start is not None
- torch.cuda.synchronize()
- time_end = time.perf_counter()
- logger.info(
- "Rearranged experts%sin %.2f seconds.",
- " (profile) " if is_profile else " ",
- time_end - time_start,
- )
+ eplb_model_state.new_physical_to_logical_map = new_physical
+ eplb_model_state.new_logical_to_physical_map = padded_logical
+ eplb_model_state.new_logical_replica_count = new_replica
+
+ eplb_model_state.rebalanced = True
+ eplb_model_state.layer_to_transfer = 0
+ eplb_model_state.pending_global_ready_check = True
+
+ # Signal async thread to start transferring layers
+ if self.is_async and (not is_profile):
+ self.rearrange_event.set()
return None
+    def start_async_loop(
+        self,
+        rank_mapping: dict[int, int] | None = None,
+        is_profile: bool = False,
+    ):
+        """Lazily start the async EPLB worker thread (no-op in sync mode
+        or when a worker has already been started)."""
+        if not self.is_async:
+            return
+        if self.async_worker is None:
+            self.async_worker = start_async_worker(
+                self,
+                rank_mapping=rank_mapping,
+                is_profile=is_profile,
+            )
+
+    def _update_layer_mapping_from_new(
+        self, model_state: EplbModelState, layer: int
+    ) -> None:
+        """Copy layer `layer`'s new expert mappings into the live tensors.
+
+        No-op unless all three `new_*` staging maps are populated. If the
+        physical-slot count changed, the whole physical map is replaced;
+        otherwise only this layer's row is updated. The new
+        logical->physical row is padded with -1 when it has fewer slots
+        than the live tensor.
+        """
+        if (
+            model_state.new_physical_to_logical_map is None
+            or model_state.new_logical_to_physical_map is None
+            or model_state.new_logical_replica_count is None
+        ):
+            return
+
+        target_device = model_state.physical_to_logical_map.device
+        new_physical = model_state.new_physical_to_logical_map
+        if model_state.physical_to_logical_map.shape[1] != new_physical.shape[1]:
+            model_state.physical_to_logical_map = new_physical.to(target_device)
+        else:
+            model_state.physical_to_logical_map[layer].copy_(
+                new_physical[layer].to(target_device)
+            )
+
+        logical_device = model_state.logical_to_physical_map.device
+        new_logical = model_state.new_logical_to_physical_map[layer].to(logical_device)
+        max_slots = model_state.logical_to_physical_map.shape[-1]
+        slot_delta = max_slots - new_logical.shape[-1]
+        if slot_delta > 0:
+            new_logical = torch.nn.functional.pad(
+                new_logical, (0, slot_delta), value=-1
+            )
+        model_state.logical_to_physical_map[layer].copy_(new_logical)
+
+        replica_device = model_state.logical_replica_count.device
+        model_state.logical_replica_count[layer].copy_(
+            model_state.new_logical_replica_count[layer].to(replica_device)
+        )
+
+    def _all_ranks_buffer_ready(self, model_state: EplbModelState) -> bool:
+        """Return True when every EP rank has its staging buffer filled.
+
+        All-reduces the local `ep_buffer_ready` flag and compares the sum to
+        the group size. Prefers the CPU group when available (avoids a GPU
+        round-trip); falls back to the device group, or to the local flag
+        when there is only one rank.
+        """
+        parallel_state = get_ep_group()
+        cpu_group = getattr(parallel_state, "cpu_group", None)
+        if cpu_group is not None and cpu_group.size() > 1:
+            flag = torch.tensor(
+                (int(model_state.ep_buffer_ready),), dtype=torch.int32, device="cpu"
+            )
+            all_reduce(flag, group=cpu_group)
+            return int(flag.item()) == cpu_group.size()
+
+        device_group = parallel_state.device_group
+        if device_group.size() <= 1:
+            return bool(model_state.ep_buffer_ready)
+
+        device = getattr(
+            parallel_state, "device", model_state.physical_to_logical_map.device
+        )
+        flag = torch.tensor(
+            (int(model_state.ep_buffer_ready),), dtype=torch.int32, device=device
+        )
+        all_reduce(flag, group=device_group)
+        return int(flag.item()) == device_group.size()
+
+ def move_to_workspace(
+ self,
+ model_state: EplbModelState,
+ ep_group: ProcessGroup,
+ is_profile: bool = False,
+ ):
+ if not model_state.buffer_lock.acquire(blocking=False):
+ return
+ try:
+ assert model_state.new_physical_to_logical_map is not None
+ device_index = model_state.cuda_device_index or self.cuda_device_index
+ if model_state.buffer_ready_event is not None and device_index is not None:
+ stream = torch.cuda.current_stream(device=device_index)
+ stream.wait_event(model_state.buffer_ready_event)
+ model_state.buffer_ready_event = None
+ move_from_buffer(
+ expert_weights=model_state.model.expert_weights[
+ model_state.layer_to_transfer
+ ],
+ expert_weights_buffer=model_state.expert_buffer,
+ is_unchanged=model_state.is_unchanged,
+ is_received_locally=model_state.is_received_locally,
+ experts_recv_loc=model_state.experts_recv_loc,
+ new_indices=model_state.new_physical_to_logical_map[
+ model_state.layer_to_transfer
+ ].tolist(),
+ ep_group=ep_group,
+ )
+ transferred_layer = model_state.layer_to_transfer
+ self._update_layer_mapping_from_new(model_state, transferred_layer)
+ # After the main thread consumes, advance layer_to_transfer
+ model_state.layer_to_transfer += 1
+ model_state.ep_buffer_ready = 0
+ logger.info(
+ "model %s successfully move_to_workspace layer %d",
+ model_state.model_name,
+ transferred_layer,
+ )
+ finally:
+ try:
+ model_state.buffer_lock.release()
+ except Exception as e:
+ logger.error(
+ "Rank %d: buffer_lock release failed in move_to_workspace: %s",
+ ep_group.rank(),
+ str(e),
+ )
+
+    def post_eplb(self, model_state: EplbModelState, is_profile: bool = False) -> None:
+        """Finalize a completed async rearrangement.
+
+        Outside profiling, re-applies the new mappings for every layer, then
+        drops the `new_*` staging tensors so the next rearrange starts clean.
+        """
+        assert model_state.new_physical_to_logical_map is not None
+        assert model_state.new_logical_to_physical_map is not None
+        assert model_state.new_logical_replica_count is not None
+        if not is_profile:
+            for layer_idx in range(model_state.physical_to_logical_map.shape[0]):
+                self._update_layer_mapping_from_new(model_state, layer_idx)
+        model_state.new_physical_to_logical_map = None
+        model_state.new_logical_to_physical_map = None
+        model_state.new_logical_replica_count = None
+
@staticmethod
def recv_state() -> tuple[list[torch.Tensor], list[torch.Tensor]]:
"""
diff --git a/vllm/distributed/eplb/rebalance_execute.py b/vllm/distributed/eplb/rebalance_execute.py
index 5c1efbaf03bab..376dad8a72ef1 100644
--- a/vllm/distributed/eplb/rebalance_execute.py
+++ b/vllm/distributed/eplb/rebalance_execute.py
@@ -100,18 +100,19 @@ def get_ep_ranks_with_expert(
return ranks_to_send, ranks_to_recv_actual
-def shuffle_layer(
+def move_to_buffer(
num_local_experts: int,
- ep_rank: int,
old_indices: Sequence[int],
new_indices: Sequence[int],
expert_weights: Iterable[torch.Tensor],
expert_weights_buffer: Sequence[torch.Tensor],
+ cuda_stream: torch.cuda.Stream | None,
ep_group: ProcessGroup,
-) -> None:
+) -> tuple[list[bool], list[bool], dict[int, int]]:
"""
Perform expert weights rearrangement of one layer.
"""
+ ep_rank = ep_group.rank()
local2global = partial(
idx_local_to_global,
local_cnt=num_local_experts,
@@ -137,7 +138,8 @@ def shuffle_layer(
if old_indices[src_global] == new_indices[dst_global]:
is_received_locally[dst] = True
for weight, buffer in zip(expert_weights, expert_weights_buffer):
- buffer[dst].copy_(weight[src])
+ with torch.cuda.stream(cuda_stream):
+ buffer[dst].copy_(weight[src], non_blocking=True)
p2p_ops: list[P2POp] = []
@@ -225,25 +227,115 @@ def shuffle_layer(
]
# 4. Execute the P2P operations. The real communication happens here.
- if p2p_ops:
+ if p2p_ops and cuda_stream is not None:
+ with torch.cuda.stream(cuda_stream):
+ reqs = batch_isend_irecv(p2p_ops)
+ for req in reqs:
+ req.wait()
+ elif p2p_ops:
reqs = batch_isend_irecv(p2p_ops)
for req in reqs:
req.wait()
+ # wait for the communication to finish
+ return is_unchanged, is_received_locally, experts_recv_loc
+
+
+def move_from_buffer(
+ expert_weights: Iterable[torch.Tensor],
+ expert_weights_buffer: list[torch.Tensor],
+ is_unchanged: list[bool],
+ is_received_locally: list[bool],
+ experts_recv_loc: dict[int, int],
+ new_indices: Sequence[int],
+ ep_group: ProcessGroup,
+) -> None:
+ ep_rank = ep_group.rank()
+ num_local_experts = len(is_unchanged)
+
+ local2global = partial(
+ idx_local_to_global, local_cnt=num_local_experts, ep_rank=ep_rank
+ )
- # 5. Copy the weights from the buffer back to the original weights.
for dst in range(num_local_experts):
if is_unchanged[dst]:
continue
if is_received_locally[dst]:
for weight, buffer in zip(expert_weights, expert_weights_buffer):
- weight[dst].copy_(buffer[dst])
+ weight[dst].copy_(buffer[dst], non_blocking=True)
else:
expert = new_indices[local2global(dst)]
if expert == -1:
continue
src = experts_recv_loc[expert]
for weight, buffer in zip(expert_weights, expert_weights_buffer):
- weight[dst].copy_(buffer[src])
+ weight[dst].copy_(buffer[src], non_blocking=True)
+
+
+async def transfer_layer(
+ old_global_expert_indices: torch.Tensor,
+ new_global_expert_indices: torch.Tensor,
+ expert_weights: Sequence[Iterable[torch.Tensor]],
+ expert_weights_buffer: Sequence[torch.Tensor],
+ ep_group: ProcessGroup,
+ is_profile: bool = False,
+ layer: int = 0,
+ cuda_stream: torch.cuda.Stream | None = None,
+ rank_mapping: dict[int, int] | None = None,
+) -> tuple[list[bool], list[bool], dict[int, int]]:
+ """
+ Rearranges the expert weights in place according to the new expert indices.
+
+ The value of the indices arguments are logical indices of the experts,
+ while keys are physical.
+
+ Args:
+ old_global_expert_indices: Shape (num_moe_layers, num_physical_experts).
+ new_global_expert_indices: Shape (num_moe_layers, num_physical_experts).
+ expert_weights: A sequence of shape (num_moe_layers)(weight_count)
+ of tensors of shape (num_local_physical_experts, hidden_size_i).
+ For example, a linear layer may have up and down projection,
+ so weight_count = 2. Each weight's hidden size can be different.
+ ep_group: The device process group for expert parallelism.
+ is_profile (bool): If `True`, do not perform any actual weight copy.
+ This is used during profile run, where we only perform dummy
+ communications to reserve enough memory for the buffers.
+ """
+ ep_size = ep_group.size()
+ if rank_mapping is not None:
+ if len(rank_mapping) == ep_group.size():
+ # scale down
+ new_global_expert_indices = _map_new_expert_indices_with_rank_mapping(
+ new_global_expert_indices,
+ rank_mapping,
+ )
+ else:
+ # scale up
+ old_global_expert_indices = _map_old_expert_indices_with_rank_mapping(
+ old_global_expert_indices,
+ rank_mapping,
+ ep_group.size(),
+ )
+
+ assert old_global_expert_indices.shape[1] == new_global_expert_indices.shape[1]
+ num_moe_layers, num_physical_experts = old_global_expert_indices.shape
+ assert len(expert_weights) == num_moe_layers
+ num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
+ assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)
+ assert num_physical_experts == ep_size * num_local_physical_experts
+ # A buffer to hold the expert weights in one layer during the exchange.
+ # NOTE: Currently we assume the same weights across different layers
+ # have the same shape.
+
+ is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer(
+ num_local_experts=num_local_physical_experts,
+ old_indices=old_global_expert_indices[layer].tolist(),
+ new_indices=new_global_expert_indices[layer].tolist(),
+ expert_weights=expert_weights[layer],
+ expert_weights_buffer=expert_weights_buffer,
+ cuda_stream=cuda_stream,
+ ep_group=ep_group,
+ )
+ return is_unchanged, is_received_locally, experts_recv_loc
def rearrange_expert_weights_inplace(
@@ -296,7 +388,6 @@ def rearrange_expert_weights_inplace(
num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)
- ep_rank = ep_group.rank()
ep_size = ep_group.size()
assert num_physical_experts == ep_size * num_local_physical_experts
@@ -329,14 +420,24 @@ def rearrange_expert_weights_inplace(
torch.cuda.synchronize()
for layer in range(num_moe_layers):
- shuffle_layer(
- num_local_physical_experts,
- ep_rank,
- old_global_expert_indices_cpu[layer].tolist(),
- new_global_expert_indices_cpu[layer].tolist(),
- expert_weights[layer],
- expert_weights_buffer,
- ep_group,
+ is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer(
+ num_local_experts=num_local_physical_experts,
+ old_indices=old_global_expert_indices_cpu[layer].tolist(),
+ new_indices=new_global_expert_indices_cpu[layer].tolist(),
+ expert_weights=expert_weights[layer],
+ expert_weights_buffer=expert_weights_buffer,
+ cuda_stream=None,
+ ep_group=ep_group,
+ )
+
+ move_from_buffer(
+ expert_weights=expert_weights[layer],
+ expert_weights_buffer=expert_weights_buffer,
+ is_unchanged=is_unchanged,
+ is_received_locally=is_received_locally,
+ experts_recv_loc=experts_recv_loc,
+ new_indices=new_global_expert_indices[layer].tolist(),
+ ep_group=ep_group,
)
@@ -428,4 +529,4 @@ def _map_new_expert_indices_with_rank_mapping(
return mapped_expert_indices
-__all__ = ["rearrange_expert_weights_inplace"]
+__all__ = ["transfer_layer", "move_from_buffer"]
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 6a54e02f861e9..cbafc9c993cc2 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -3370,6 +3370,8 @@ class GPUModelRunner(
old_global_expert_indices,
rank_mapping,
)
+ if self.eplb_state.is_async:
+ self.eplb_state.start_async_loop(rank_mapping=rank_mapping)
if (
self.vllm_config.compilation_config.mode
From f716a153723d4b2e18d01380cfe25d9ac636e2ef Mon Sep 17 00:00:00 2001
From: Yuan Tang
Date: Mon, 24 Nov 2025 09:40:05 -0500
Subject: [PATCH 008/134] Update KServe guide link in documentation (#29258)
Signed-off-by: Yuan Tang
---
docs/deployment/integrations/kserve.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/deployment/integrations/kserve.md b/docs/deployment/integrations/kserve.md
index edf79fca4f93e..37b29aa1a4876 100644
--- a/docs/deployment/integrations/kserve.md
+++ b/docs/deployment/integrations/kserve.md
@@ -2,4 +2,4 @@
vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
-Please see [this guide](https://kserve.github.io/website/latest/modelserving/v1beta1/llm/huggingface/) for more details on using vLLM with KServe.
+Please see [this guide](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) for more details on using vLLM with KServe.
From 7a228b5305f3257834c5079fd475f659bc4bf73d Mon Sep 17 00:00:00 2001
From: Laith Sakka
Date: Mon, 24 Nov 2025 07:12:41 -0800
Subject: [PATCH 009/134] Add option to use unbacked, and backed size obl
dynamic shapes for more sounds compilation. (#26199)
Signed-off-by: Laith Sakka
---
docs/design/debug_vllm_compile.md | 70 ++++++++++++
docs/design/torch_compile.md | 105 +++++++++++++++++-
.../test_dynamic_shapes_compilation.py | 88 +++++++++++++++
vllm/compilation/decorators.py | 63 +++++++++--
vllm/compilation/wrapper.py | 24 +++-
vllm/config/compilation.py | 60 +++++++++-
vllm/model_executor/models/llama.py | 12 +-
vllm/model_executor/models/qwen2.py | 35 +++++-
8 files changed, 442 insertions(+), 15 deletions(-)
create mode 100644 tests/compile/test_dynamic_shapes_compilation.py
diff --git a/docs/design/debug_vllm_compile.md b/docs/design/debug_vllm_compile.md
index 8912eb58f8ac7..408d2878309dd 100644
--- a/docs/design/debug_vllm_compile.md
+++ b/docs/design/debug_vllm_compile.md
@@ -151,6 +151,76 @@ To avoid this, please either:
2. wrap the branching logic into a custom operator. TorchDynamo does not
trace into custom operators.
+## Debugging constraint violations and dynamic shapes guards issues
+
+Dynamic-shape guards are a specific category of Dynamo guards. They are constraints that `torch.compile`
+attaches to dynamic dimensions (e.g., `seq_len`) to ensure the compiled artifact remains valid.
+These guards typically appear when framework code, custom passes, or user code branches based on
+dynamic shape values.
+
+**Example:**
+
+```python
+if x > 10:
+ # path A
+else:
+ # path B
+```
+
+This creates a guard `x > 10` or `x <= 10` depending on which path was traced.
+
+**vLLM's Assumption:**
+vLLM assumes that all guards added by torch.compile are safe to drop and will not
+constrain the compiled graph to specific input shapes. When this assumption is violated,
+it can cause issues that users need to debug.
+Some side effects that indicate this assumption is violated are runtime errors
+or `ConstraintViolationError`s.
+
+A `ConstraintViolationError` will be thrown if a dynamic shape gets constrained to
+a single value. If you encounter a constraint violation error or suspect that a dynamic
+shapes guard is being added incorrectly, you can use stricter dynamic shape modes to
+help debug the issue:
+
+```sh
+# Online - using unbacked mode
+vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=unbacked
+
+# Online - using backed_size_oblivious mode
+vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=backed_size_oblivious
+```
+
+```py
+# Offline - using unbacked mode
+from vllm.config.compilation import CompilationConfig, DynamicShapesConfig, DynamicShapesType
+LLM(model, compilation_config=CompilationConfig(
+ dynamic_shapes_config=DynamicShapesConfig(type=DynamicShapesType.UNBACKED)
+))
+
+# Offline - using backed_size_oblivious mode
+from vllm.config.compilation import CompilationConfig, DynamicShapesConfig, DynamicShapesType
+LLM(model, compilation_config=CompilationConfig(
+ dynamic_shapes_config=DynamicShapesConfig(type=DynamicShapesType.BACKED_SIZE_OBLIVIOUS)
+))
+```
+
+These modes are stricter and reduce or eliminate the need for dynamic shape guarding, which can help isolate issues:
+
+- `unbacked`: Uses unbacked symints which don't allow guards, making it easier to identify where guards are being incorrectly added
+- `backed_size_oblivious`: Uses a mode that is more strict about guarding.
+
+For more details on dynamic shapes modes, see [Dynamic shapes and vLLM guard dropping](torch_compile.md#dynamic-shapes-and-vllm-guard-dropping).
+
+### Printing guards
+
+To see all guards that are being added during compilation, you can use `TORCH_LOGS=+dynamic`:
+
+```sh
+TORCH_LOGS=+dynamic vllm serve meta-llama/Llama-3.2-1B
+```
+
+Look for `[guard added]` in the logs to see where guards are being added. This can help you identify which operations are
+causing guards to be added incorrectly.
+
## Debugging TorchInductor
TorchInductor takes a captured graph and then compiles it down to some Python code
diff --git a/docs/design/torch_compile.md b/docs/design/torch_compile.md
index 27edc4f89201d..7b0b2c1e96978 100644
--- a/docs/design/torch_compile.md
+++ b/docs/design/torch_compile.md
@@ -29,6 +29,109 @@ A unique aspect of vLLM's `torch.compile` integration, is that we guarantee all
By default, the cache saves compiled artifacts as binary files. If you would like to interact with the generated code for debugging purposes, set the field `compile_cache_save_format=unpacked` in the compilation config, or omit this and set the env variable `VLLM_COMPILE_CACHE_SAVE_FORMAT=unpacked`.
+## Dynamic shapes and vllm guard dropping
+
+`torch.compile` is designed to guard on dynamic shapes with no hesitation
+when needed. This conflicts with vLLM's `torch.compile` approach of
+dropping the guards, since many of those guards could be material.
+
+`torch.compile` provides two kinds of dynamic shapes: `backed` and `unbacked`.
+`torch.compile` guards on `backed` dynamic shapes and does not provide a
+guarantee that no guards will be added to them. User code, dynamo,
+inductor, and autograd all can add guards. Moreover, for 0/1
+specializations, backed symbols are specialized unconditionally to 0, 1,
+or >=2 even without encountering branching on those ranges.
+
+On the contrary, `unbacked` dynamic shapes are guaranteed not to be guarded
+on and are not 0/1 specialized. However, there is a possibility of
+throwing a data dependent error when a branch that requires their value is
+encountered and no explicit unbacked handling is defined. The framework is
+converging to a state where it won't throw DDE but rather pick general
+paths. One downside of using unbacked is missed optimization opportunities,
+due either to perf bugs or to picking general paths, as well as using a fixed
+hint that is not based on example inputs (this will be fixed soon with the
+override_hint API). An example of picking general paths is assuming the input
+is not contiguous in functions that call contiguous() and reshape() when
+contiguity can't be symbolically proven, at the cost of introducing a clone.
+
+`backed_size_oblivious` is a flag that enables treating backed symbols as
+unbacked wherever explicit handling for unbacked is defined. With this
+mode, 0/1 specializations are mostly avoided in framework code and the
+default 0/1 specialization does not happen. However, there is still no
+guarantee that torch.compile won't guard, especially due to user code or
+custom passes. `backed_size_oblivious` is experimental in PyTorch compile
+and could be deprecated. That said, it's a safer option to use than
+`backed` and the probability of reducing performance is lower than
+`unbacked`.
+
+### Configuring Dynamic Shapes
+
+The `DynamicShapesConfig` allows you to control the dynamic shapes behavior by
+setting the `type` field. You can choose between three modes:
+`BACKED`(default), `UNBACKED` , and `BACKED_SIZE_OBLIVIOUS`.
+
+#### Offline Inference Example (Using LLM class)
+
+When using the `LLM` class for offline inference, you can configure dynamic
+shapes through the `compilation_config` parameter:
+
+```python
+from vllm import LLM, SamplingParams
+from vllm.config.compilation import CompilationConfig, DynamicShapesConfig, DynamicShapesType
+
+# Example: Using backed_size_oblivious (experimental, safer than backed)
+llm = LLM(
+ model="meta-llama/Llama-3.2-1B",
+ compilation_config=CompilationConfig(
+ dynamic_shapes_config=DynamicShapesConfig(
+ type=DynamicShapesType.BACKED_SIZE_OBLIVIOUS
+ )
+ )
+)
+
+# Example: Using unbacked (strongest guarantee against guards)
+llm = LLM(
+ model="meta-llama/Llama-3.2-1B",
+ compilation_config=CompilationConfig(
+ dynamic_shapes_config=DynamicShapesConfig(
+ type=DynamicShapesType.UNBACKED
+ )
+ )
+)
+
+# Generate outputs
+prompts = ["Hello, my name is", "The future of AI is"]
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+outputs = llm.generate(prompts, sampling_params)
+```
+
+#### Online Serving Example (Using vllm serve)
+
+When using `vllm serve` for online serving, you can configure dynamic shapes
+through the `--compilation-config` flag:
+
+```bash
+# Example: Using unbacked
+vllm serve meta-llama/Llama-3.2-1B \
+ --compilation-config '{"dynamic_shapes_config": {"type": "unbacked"}}'
+
+
+# Alternative: Using dot notation (simpler for single values)
+vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=unbacked
+```
+
+#### Choosing the Right Mode
+
+- **BACKED** (default): Use when you're willing to accept potential unsafe dropping of guards
+for maximal performance. Guards could be unsoundly added and then ignored.
+
+- **UNBACKED**: Use when you need the strongest guarantee against guards.
+ This is the most conservative option but may miss some optimization opportunities.
+
+- **BACKED_SIZE_OBLIVIOUS**: Use when you want a balance between avoiding guards
+ and performance. This experimental mode is safer than BACKED but still not as
+ conservative as UNBACKED.
+
## Python Code Compilation
In the very verbose logs, we can see:
@@ -122,7 +225,7 @@ When all the shapes are known, `torch.compile` can compare different configs, an
triton_mm_4 0.0130 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
triton_mm_8 0.0134 ms 97.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
triton_mm_12 0.0148 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4
- mm 0.0160 ms 81.6%
+ mm 0.0160 ms 81.6%
triton_mm_16 0.0165 ms 78.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
triton_mm_3 0.0199 ms 65.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
triton_mm_1 0.0203 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2
diff --git a/tests/compile/test_dynamic_shapes_compilation.py b/tests/compile/test_dynamic_shapes_compilation.py
new file mode 100644
index 0000000000000..c20aea822fe81
--- /dev/null
+++ b/tests/compile/test_dynamic_shapes_compilation.py
@@ -0,0 +1,88 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import gc
+
+import pytest
+import torch
+
+from vllm import LLM, SamplingParams
+from vllm.config.compilation import CompilationMode, DynamicShapesType
+from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.utils.torch_utils import is_torch_equal_or_newer
+
+
+def get_test_models():
+ """Get list of models to test based on PyTorch version"""
+ # TODO "Qwen/Qwen3-4B-Instruct-2507" fails Fix issue and support it.
+ return ["gpt2", "Qwen/Qwen2-7B-Instruct", "meta-llama/Llama-3.1-8B"]
+
+
+@pytest.mark.parametrize("model_name", get_test_models())
+@pytest.mark.parametrize(
+ "shapes_type",
+ [
+ DynamicShapesType.BACKED,
+ DynamicShapesType.UNBACKED,
+ DynamicShapesType.BACKED_SIZE_OBLIVIOUS,
+ ],
+)
+@pytest.mark.parametrize("use_aot_compile", ["0"])
+@pytest.mark.parametrize("use_bytecode_hook", [True, False])
+@pytest.mark.skipif(
+ not is_torch_equal_or_newer("2.10.0.dev"), reason="requires torch 2.10"
+)
+def test_dynamic_shapes_compilation(
+ monkeypatch, model_name, shapes_type, use_aot_compile, use_bytecode_hook
+):
+ """Test that all dynamic shapes types compile successfully"""
+ print(
+ f"\nTesting model: {model_name} with {shapes_type.name}, "
+ f"AOT compile: {use_aot_compile}, "
+ f"Bytecode hook: {use_bytecode_hook}"
+ )
+ if use_bytecode_hook and shapes_type == DynamicShapesType.UNBACKED:
+ pytest.skip("UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0")
+
+ monkeypatch.setenv("VLLM_USE_AOT_COMPILE", use_aot_compile)
+ monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
+
+ prompt = "Hello, my name is"
+
+ print(f"Testing {shapes_type.name} dynamic shapes...")
+
+ # Initialize the model with specific dynamic shapes configuration
+ model = LLM(
+ model=model_name,
+ compilation_config={
+ "mode": CompilationMode.VLLM_COMPILE,
+ "dynamic_shapes_config": {
+ "type": shapes_type.value,
+ },
+ },
+ )
+
+ output = model.generate(prompt)
+ result = output[0].outputs[0].text
+ # Example of setting the sampling parameters
+ tokenizer = get_tokenizer(model_name)
+ yes_tokens = tokenizer.encode("yes", add_special_tokens=False)
+ no_tokens = tokenizer.encode("no", add_special_tokens=False)
+ allowed_ids = list(set(yes_tokens + no_tokens))
+ sampling_params = SamplingParams(
+ max_tokens=1, temperature=0, allowed_token_ids=allowed_ids
+ )
+
+ output = model.generate(
+ "answer with yes or no is " + result + " rubbish for prompt " + prompt + "?",
+ sampling_params=sampling_params,
+ )
+ result = output[0].outputs[0].text
+ assert result == "yes"
+
+ # Clean up GPU memory
+ del model
+ gc.collect()
+ torch.cuda.empty_cache()
+ torch.cuda.synchronize()
+ print("GPU memory cleared")
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 11a18c0e6bb78..6d9da1c488c6d 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -24,6 +24,7 @@ from vllm.config import (
get_current_vllm_config,
set_current_vllm_config,
)
+from vllm.config.compilation import DynamicShapesType
from vllm.logger import init_logger
from vllm.sequence import IntermediateTensors
from vllm.utils.import_utils import resolve_obj_by_qualname
@@ -104,6 +105,7 @@ def support_torch_compile(
dynamic_arg_dims: dict[str, int | list[int]] | None = None,
mark_unbacked_dims: dict[str, int | list[int]] | None = None,
enable_if: Callable[[VllmConfig], bool] | None = None,
+ shape_invariants: Callable[..., None] = lambda *args, **kwargs: None,
) -> Callable[[_T], _T] | _T:
"""
A decorator to add support for compiling the forward method of a class.
@@ -161,6 +163,14 @@ def support_torch_compile(
dim to be decorated with `mark_unbacked`. This is useful if we would like to
enforce that dynamo does not specialize on 0/1 values in the case of dummy input
such as for vision model compilation
+
+ `shape_invariants` is a function that gets compiled right before forward.
+ The function should have the torch._check calls that are needed to set
+ the relationships between different input sizes. For example:
+ torch._check(input_ids.size()[0] == inputs_embeds.size()[0])
+ This enforces constraints on the symbolic shapes without hardcoding
+ specific values. It is needed for some models to avoid data dependent
+ errors.
"""
def cls_decorator_helper(cls: _T) -> _T:
@@ -199,7 +209,11 @@ def support_torch_compile(
f"Argument {k} not found in the forward method of {cls}"
)
return _support_torch_compile(
- cls, inferred_dynamic_arg_dims, mark_unbacked_dims, enable_if
+ cls,
+ inferred_dynamic_arg_dims,
+ mark_unbacked_dims,
+ enable_if,
+ shape_invariants,
)
if cls is not None:
@@ -242,6 +256,7 @@ def _support_torch_compile(
dynamic_arg_dims: dict[str, int | list[int]],
mark_unbacked_dims: dict[str, int | list[int]] | None = None,
enable_if: Callable[[VllmConfig], bool] | None = None,
+ shape_invariants: Callable[..., None] = lambda *args, **kwargs: None,
) -> _T:
"""
A decorator to add support for compiling the forward method of a class.
@@ -276,11 +291,12 @@ def _support_torch_compile(
old_init(self, **kwargs)
self.vllm_config = vllm_config
+ self.compilation_config = self.vllm_config.compilation_config
enable_compile = enable_if is None or enable_if(vllm_config)
# for CompilationMode.STOCK_TORCH_COMPILE , the upper level model runner
# will handle the compilation, so we don't need to do anything here.
self.do_not_compile = (
- vllm_config.compilation_config.mode
+ self.compilation_config.mode
in [CompilationMode.NONE, CompilationMode.STOCK_TORCH_COMPILE]
or not supports_dynamo()
or _should_ignore_torch_compile(self.__class__)
@@ -289,29 +305,38 @@ def _support_torch_compile(
if self.do_not_compile:
return
+ self._check_shape_invariants = shape_invariants
+
compilation_counter.num_models_seen += 1
self.compiled = False
TorchCompileWithNoGuardsWrapper.__init__(self)
cls.__init__ = __init__
- def _mark_dynamic_inputs(mod, *args, **kwargs):
+ def _mark_dynamic_inputs(mod, type, *args, **kwargs):
+ def mark_dynamic(arg, dims):
+ if type == DynamicShapesType.UNBACKED:
+ torch._dynamo.decorators.mark_unbacked(arg, dims)
+ else:
+ torch._dynamo.mark_dynamic(arg, dims)
+
sig = inspect.signature(mod.__class__.forward)
bound_args = sig.bind(mod, *args, **kwargs)
bound_args.apply_defaults()
for k, dims in dynamic_arg_dims.items():
arg = bound_args.arguments.get(k)
+
if arg is not None:
dims = [dims] if isinstance(dims, int) else dims
if isinstance(arg, torch.Tensor):
# In case dims is specified with negative indexing
dims = [arg.ndim + dim if dim < 0 else dim for dim in dims]
- torch._dynamo.mark_dynamic(arg, dims)
+ mark_dynamic(arg, dims)
elif isinstance(arg, IntermediateTensors):
for tensor in arg.tensors.values():
# In case dims is specified with negative indexing
dims = [tensor.ndim + dim if dim < 0 else dim for dim in dims]
- torch._dynamo.mark_dynamic(tensor, dims)
+ mark_dynamic(tensor, dims)
else:
raise ValueError(
"Unsupported dynamic dimensions"
@@ -338,6 +363,7 @@ def _support_torch_compile(
if getattr(self, "aot_compiled_fn", None) is not None:
return self.aot_compiled_fn(self, *args, **kwargs)
+ ds_type = self.compilation_config.dynamic_shapes_config.type
cache_dir = None
aot_compilation_path = None
if envs.VLLM_USE_AOT_COMPILE:
@@ -352,6 +378,14 @@ def _support_torch_compile(
serialized backend artifacts), then we need to generate a new AOT
compile artifact from scratch.
"""
+ # Validate that AOT compile is not used with unbacked dynamic
+ # shapes. aot_compile re-allocates backed symbols post dynamo!
+ if ds_type == DynamicShapesType.UNBACKED:
+ raise ValueError(
+ "AOT compilation is not compatible with UNBACKED dynamic shapes. "
+ "Please use BACKED or BACKED_SIZE_OBLIVIOUS dynamic shapes type "
+ "when VLLM_USE_AOT_COMPILE is enabled."
+ )
from .caching import compilation_config_hash_factors
factors: list[str] = compilation_config_hash_factors(self.vllm_config)
@@ -401,7 +435,12 @@ def _support_torch_compile(
# This is the path for the first compilation.
# the first compilation needs to have dynamic shapes marked
- _mark_dynamic_inputs(self, *args, **kwargs)
+ _mark_dynamic_inputs(
+ self,
+ ds_type,
+ *args,
+ **kwargs,
+ )
# here, it is the starting point of the `torch.compile` process
start_monitoring_torch_compile(self.vllm_config)
@@ -417,9 +456,7 @@ def _support_torch_compile(
# properly when any of these files change.
# 1. the file containing the top-level forward function
- self.vllm_config.compilation_config.traced_files.add(
- original_code_object.co_filename
- )
+ self.compilation_config.traced_files.add(original_code_object.co_filename)
# 2. every time Dynamo sees a function call, it will inline
# the function by calling InliningInstructionTranslator.inline_call_
@@ -429,7 +466,7 @@ def _support_torch_compile(
def patched_inline_call(self_):
code = self_.f_code
- self.vllm_config.compilation_config.traced_files.add(code.co_filename)
+ self.compilation_config.traced_files.add(code.co_filename)
return inline_call(self_)
# Disable the C++ compilation of symbolic shape guards. C++-fication
@@ -445,12 +482,18 @@ def _support_torch_compile(
# if the config doesn't exist
logger.debug("enable_cpp_symbolic_shape_guards config not available")
+ # Prepare backed_size_oblivious config patch if needed
+ fx_config_patches = {}
+ if ds_type == DynamicShapesType.BACKED_SIZE_OBLIVIOUS:
+ fx_config_patches["backed_size_oblivious"] = True
+
with (
patch.object(
InliningInstructionTranslator, "inline_call_", patched_inline_call
),
torch._dynamo.config.patch(**dynamo_config_patches),
maybe_use_cudagraph_partition_wrapper(self.vllm_config),
+ torch.fx.experimental._config.patch(**fx_config_patches),
_torch27_patch_tensor_subclasses(),
):
if envs.VLLM_USE_AOT_COMPILE:
diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py
index 493e57f97f0f4..b120c85bf232e 100644
--- a/vllm/compilation/wrapper.py
+++ b/vllm/compilation/wrapper.py
@@ -6,6 +6,7 @@ import sys
from abc import abstractmethod
from contextlib import contextmanager
from types import CodeType
+from typing import Any
import torch
import torch._C._dynamo.guards
@@ -85,6 +86,12 @@ class TorchCompileWithNoGuardsWrapper:
since we drop all guards.
"""
+ def check_invariants_and_forward(self, *args, **kwargs):
+ assert hasattr(self, "_check_shape_invariants")
+ self._check_shape_invariants(*args, **kwargs)
+
+ return self.forward(*args, **kwargs)
+
def __init__(self):
self.compiled = False
@@ -104,6 +111,21 @@ class TorchCompileWithNoGuardsWrapper:
# Drop all the guards.
options["guard_filter_fn"] = lambda x: [False for _ in x]
+ # Validate that unbacked dynamic shapes require VLLM_USE_BYTECODE_HOOK=False
+ from vllm.compilation.decorators import DynamicShapesType
+
+ ds_type = vllm_config.compilation_config.dynamic_shapes_config.type
+ compiled_ptr: Any = self.forward
+ if ds_type == DynamicShapesType.UNBACKED:
+ if envs.VLLM_USE_BYTECODE_HOOK:
+ # reason is that bytecode does this hack torch._dynamo.eval_frame.
+ # remove_from_cache(self.original_code_object()) to force a new
+ # re-compilation.
+ raise ValueError(
+ "UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0. "
+ )
+ compiled_ptr = self.check_invariants_and_forward
+
if envs.VLLM_USE_AOT_COMPILE:
if hasattr(torch._dynamo.config, "enable_aot_compile"):
torch._dynamo.config.enable_aot_compile = True
@@ -114,7 +136,7 @@ class TorchCompileWithNoGuardsWrapper:
logger.warning(msg)
self._compiled_callable = torch.compile(
- self.forward,
+ compiled_ptr,
fullgraph=True,
dynamic=False,
backend=backend,
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 9b5309598d0e2..42eccf9f41123 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -192,6 +192,54 @@ class PassConfig:
self.enable_qk_norm_rope_fusion = False
+class DynamicShapesType(str, enum.Enum):
+ """Types of dynamic shapes handling in torch.compile().
+ see Dynamic shapes and vllm guard dropping in torch_compile.md
+ for more details."""
+
+ BACKED = "backed"
+ """Use backed dynamic shapes. torch.compile() guards on backed dynamic
+ shapes and may add guards. Symbols are specialized to 0, 1, or >=2 even
+ without encountering branching on those ranges."""
+
+ UNBACKED = "unbacked"
+ """Use unbacked dynamic shapes. Guaranteed not to be guarded on and not
+ 0/1 specialized, but may throw data dependent errors when branches require
+ their value without explicit unbacked handling."""
+
+ BACKED_SIZE_OBLIVIOUS = "backed_size_oblivious"
+ """Experimental flag that treats backed symbols as unbacked when explicit
+ unbacked handling is defined."""
+
+
+@config
+@dataclass
+class DynamicShapesConfig:
+ """Configuration to control/debug torch compile dynamic shapes."""
+
+ type: DynamicShapesType = DynamicShapesType.BACKED
+ """Controls the type of dynamic shapes handling to use with torch.compile().
+
+ - BACKED: Default PyTorch behavior with potential guards ignored.
+ - UNBACKED: No guards guaranteed (most sound) but may throw
+ data dependent errors.
+ - BACKED_SIZE_OBLIVIOUS: Experimental safer alternative to
+ backed/unbacked.
+ """
+
+ # TODO add a debug mode to fail
+
+ def compute_hash(self) -> str:
+ """
+ Provide a hash for DynamicShapesConfig
+ """
+
+ from vllm.config.utils import get_hash_factors, hash_factors
+
+ factors = get_hash_factors(self, {})
+ return hash_factors(factors)
+
+
@config
@dataclass
class CompilationConfig:
@@ -322,7 +370,7 @@ class CompilationConfig:
If empty list [], no ops are excluded (suitable for full cudagraphs)."""
compile_mm_encoder: bool = False
"""Whether or not to compile the multimodal encoder.
- Currently, this only works for `Qwen2_5_vl` on selected platforms.
+ Currently, this only works for `Qwen2_5_vl` on selected platforms.
Disabled by default until more models are supported/tested to work."""
# Inductor capture
@@ -348,9 +396,11 @@ class CompilationConfig:
"""Sizes to compile for inductor. In addition
to integers, it also supports "cudagraph_capture_sizes" to
specify the sizes for cudagraph capture."""
+
inductor_compile_config: dict = field(default_factory=dict)
"""Additional configurations for inductor.
- None: use default configurations."""
+
inductor_passes: dict[str, str] = field(default_factory=dict)
"""Additional passes for inductor. It is a dictionary
from pass name to pass function qualified name. We use function
@@ -460,8 +510,15 @@ class CompilationConfig:
max_num_seqs, and prevents capture of many large graphs (>512) that would
greatly increase startup time with limited performance benefit.
"""
+
+ dynamic_shapes_config: DynamicShapesConfig = field(
+ default_factory=DynamicShapesConfig
+ )
+ """Configuration for dynamic shapes options"""
+
local_cache_dir: str = field(default=None, init=False) # type: ignore
"""local cache dir for each rank"""
+
bs_to_padded_graph_size: list[int] = field(
default=None, # type: ignore
init=False,
@@ -530,6 +587,7 @@ class CompilationConfig:
from vllm.config.utils import get_hash_factors, hash_factors
factors = get_hash_factors(self, ignored_factors)
+
factors["pass_config"] = self.pass_config.compute_hash()
return hash_factors(factors)
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index ebf8addda4a54..eebb9e07fa89d 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -354,7 +354,17 @@ class LlamaDecoderLayer(nn.Module):
return vllm_config.quant_config
-@support_torch_compile
+def llama_model_invariants(
+ input_ids, positions, intermediate_tensors=None, inputs_embeds=None
+):
+ """Shape invariants for Llama model compilation, those are translated to
+ runtime assertions for unbacked dynamic shapes and are compiled away for
+ backed"""
+ if input_ids is not None:
+ torch._check(positions.size()[0] == input_ids.size()[0])
+
+
+@support_torch_compile(shape_invariants=llama_model_invariants)
class LlamaModel(nn.Module):
def __init__(
self,
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 32b6d6dd07b83..5831ce0b3d64b 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -274,6 +274,38 @@ class Qwen2DecoderLayer(nn.Module):
return hidden_states, residual
+def qwen_2_model_invariants(
+ input_ids: torch.Tensor,
+ positions: torch.Tensor,
+ intermediate_tensors: IntermediateTensors | None = None,
+ inputs_embeds: torch.Tensor | None = None,
+):
+ """Shape invariants for Qwen2Model Model, those are translated to
+ runtime assertions for unbacked dynamic shapes and are compiled away for
+ backed"""
+ # All these should be equal.
+ # input_ids.size()[0]
+ # positions.size()[-1]
+ # intermediate_tensors["hidden_states"].size()[0]
+ # inputs_embeds.size()[0]
+ torch._check(input_ids.size()[0] == positions.size()[-1])
+ if intermediate_tensors is not None:
+ torch._check(
+ input_ids.size()[0] == intermediate_tensors["hidden_states"].size()[0]
+ )
+
+ if inputs_embeds is not None:
+ torch._check(input_ids.size()[0] == inputs_embeds.size()[0])
+
+ # Hidden dimensions should match (hidden_size)
+ # intermediate_tensors["hidden_states"].size()[1]
+ # inputs_embeds.size()[1]
+ if inputs_embeds is not None and intermediate_tensors is not None:
+ torch._check(
+ inputs_embeds.size()[1] == intermediate_tensors["hidden_states"].size()[1]
+ )
+
+
@support_torch_compile(
dynamic_arg_dims={
"input_ids": 0,
@@ -282,7 +314,8 @@ class Qwen2DecoderLayer(nn.Module):
"positions": -1,
"intermediate_tensors": 0,
"inputs_embeds": 0,
- }
+ },
+ shape_invariants=qwen_2_model_invariants,
)
class Qwen2Model(nn.Module):
def __init__(
From e48b2e6848ac0084860c8a6bd73927d7535a2c61 Mon Sep 17 00:00:00 2001
From: vllmellm
Date: Mon, 24 Nov 2025 22:24:49 +0700
Subject: [PATCH 010/134] [Bugfix] [ROCm] [UX] Reorganize ROCm Backend
Selection Logic (#26980)
Signed-off-by: vllmellm
---
.../test_rocm_attention_backends_selection.py | 337 ++++++++++++++++++
vllm/platforms/rocm.py | 80 +++--
2 files changed, 394 insertions(+), 23 deletions(-)
create mode 100644 tests/v1/attention/test_rocm_attention_backends_selection.py
diff --git a/tests/v1/attention/test_rocm_attention_backends_selection.py b/tests/v1/attention/test_rocm_attention_backends_selection.py
new file mode 100644
index 0000000000000..4ec79e9eb6ba4
--- /dev/null
+++ b/tests/v1/attention/test_rocm_attention_backends_selection.py
@@ -0,0 +1,337 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for attention backend selectors."""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+import torch
+
+from vllm.attention.backends.registry import AttentionBackendEnum
+from vllm.platforms import current_platform
+
+# ROCm-specific attention backend selection tests
+pytestmark = pytest.mark.skipif(
+ not current_platform.is_rocm(), reason="ROCm-specific tests"
+)
+
+
+@pytest.fixture
+def mock_vllm_config():
+ """Create a mock VllmConfig for testing."""
+ config = MagicMock()
+ config.model_config.dtype = torch.float16
+ config.model_config.hf_config.architectures = ["LlamaForCausalLM"]
+ config.cache_config.block_size = 16
+ return config
+
+
+@pytest.fixture
+def mock_on_gfx9():
+ """Mock the on_gfx9 function to return True."""
+ with patch("vllm.platforms.rocm.on_gfx9", return_value=True):
+ yield
+
+
+@pytest.mark.parametrize(
+ "env_vars, selected_backend, expected_backend_path",
+ [
+ # Test Case 1: Default (no env vars, no explicit backend)
+ (
+ {},
+ None,
+ AttentionBackendEnum.TRITON_ATTN.get_path(),
+ ),
+ # Test Case 2: Explicit TRITON_ATTN backend
+ (
+ {},
+ "TRITON_ATTN",
+ AttentionBackendEnum.TRITON_ATTN.get_path(),
+ ),
+ # Test Case 3: Explicit ROCM_ATTN backend
+ (
+ {},
+ "ROCM_ATTN",
+ AttentionBackendEnum.ROCM_ATTN.get_path(),
+ ),
+ # Test Case 4: Explicit ROCM_AITER_FA backend
+ (
+ {},
+ "ROCM_AITER_FA",
+ AttentionBackendEnum.ROCM_AITER_FA.get_path(),
+ ),
+ # Test Case 5: Explicit ROCM_AITER_UNIFIED_ATTN backend
+ (
+ {},
+ "ROCM_AITER_UNIFIED_ATTN",
+ AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path(),
+ ),
+ # Test Case 6: VLLM_ROCM_USE_AITER=1
+ # (defaults to AITER FA when MHA not explicitly disabled)
+ (
+ {"VLLM_ROCM_USE_AITER": "1"},
+ None,
+ AttentionBackendEnum.ROCM_AITER_FA.get_path(),
+ ),
+ # Test Case 7: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=1
+ (
+ {"VLLM_ROCM_USE_AITER": "1", "VLLM_ROCM_USE_AITER_MHA": "1"},
+ None,
+ AttentionBackendEnum.ROCM_AITER_FA.get_path(),
+ ),
+ # Test Case 8: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
+ (
+ {
+ "VLLM_ROCM_USE_AITER": "1",
+ "VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION": "1",
+ },
+ None,
+ AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path(),
+ ),
+ # Test Case 9: VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1
+ (
+ {"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"},
+ None,
+ AttentionBackendEnum.ROCM_ATTN.get_path(),
+ ),
+ # Test Case 10: VLLM_ROCM_USE_AITER=1 + explicit TRITON_ATTN
+ (
+ {"VLLM_ROCM_USE_AITER": "1"},
+ "TRITON_ATTN",
+ AttentionBackendEnum.TRITON_ATTN.get_path(),
+ ),
+ # Test Case 11: VLLM_ROCM_USE_AITER=1 + VLLM_ROCM_USE_AITER_MHA=0
+ # (explicitly disabled)
+ (
+ {"VLLM_ROCM_USE_AITER": "1", "VLLM_ROCM_USE_AITER_MHA": "0"},
+ None,
+ AttentionBackendEnum.TRITON_ATTN.get_path(),
+ ),
+ # Test Case 12: VLLM_ROCM_USE_AITER=1 + explicit ROCM_ATTN
+ (
+ {"VLLM_ROCM_USE_AITER": "1"},
+ "ROCM_ATTN",
+ AttentionBackendEnum.ROCM_ATTN.get_path(),
+ ),
+ ],
+)
+def test_standard_attention_backend_selection(
+ env_vars,
+ selected_backend,
+ expected_backend_path,
+ mock_vllm_config,
+ mock_on_gfx9,
+ monkeypatch,
+):
+ """Test standard attention backend selection with various configurations."""
+ # Set environment variables
+ for key, value in env_vars.items():
+ monkeypatch.setenv(key, value)
+
+ # Import after setting env vars to ensure they're picked up
+ # Reload envs to pick up new environment variables
+ import importlib
+
+ import vllm.envs as envs
+ from vllm.attention.backends.registry import _Backend
+
+ importlib.reload(envs)
+
+ # Convert string backend to enum if provided
+ backend_enum = None
+ if selected_backend:
+ backend_enum = getattr(_Backend, selected_backend)
+
+ # Get the backend class path
+ from vllm.platforms.rocm import RocmPlatform
+
+ backend_path = RocmPlatform.get_attn_backend_cls(
+ selected_backend=backend_enum,
+ head_size=128,
+ dtype=torch.float16,
+ kv_cache_dtype="auto",
+ block_size=16,
+ use_mla=False,
+ has_sink=False,
+ use_sparse=False,
+ )
+ assert backend_path == expected_backend_path
+
+
+@pytest.mark.parametrize(
+ "env_vars, selected_backend, block_size, expected_backend_path, should_raise",
+ [
+ # Test Case 1: TRITON_MLA with block_size != 1
+ (
+ {},
+ "TRITON_MLA",
+ 16,
+ AttentionBackendEnum.TRITON_MLA.get_path(),
+ False,
+ ),
+ # Test Case 2: TRITON_MLA with block_size == 1 (should raise)
+ (
+ {},
+ "TRITON_MLA",
+ 1,
+ None,
+ True,
+ ),
+ # Test Case 3: ROCM_AITER_MLA with block_size == 1
+ (
+ {},
+ "ROCM_AITER_MLA",
+ 1,
+ AttentionBackendEnum.ROCM_AITER_MLA.get_path(),
+ False,
+ ),
+ # Test Case 4: ROCM_AITER_MLA with block_size != 1 (should raise)
+ (
+ {},
+ "ROCM_AITER_MLA",
+ 16,
+ AttentionBackendEnum.ROCM_AITER_MLA.get_path(),
+ False,
+ ),
+ # Test Case 5: VLLM_ROCM_USE_AITER=1 with block_size == 1
+ (
+ {"VLLM_ROCM_USE_AITER": "1"},
+ None,
+ 1,
+ AttentionBackendEnum.ROCM_AITER_MLA.get_path(),
+ False,
+ ),
+ # Test Case 6: VLLM_ROCM_USE_AITER=1 with block_size == 16
+ # (should use ROCM_AITER_MLA now, as it supports block_size 16)
+ (
+ {"VLLM_ROCM_USE_AITER": "1"},
+ None,
+ 16,
+ AttentionBackendEnum.ROCM_AITER_MLA.get_path(),
+ False,
+ ),
+ # Test Case 7: VLLM_ROCM_USE_AITER=1 + explicit TRITON_MLA
+ (
+ {"VLLM_ROCM_USE_AITER": "1"},
+ "TRITON_MLA",
+ 16,
+ AttentionBackendEnum.TRITON_MLA.get_path(),
+ False,
+ ),
+ # Test Case 8: Explicit ROCM_AITER_TRITON_MLA
+ (
+ {},
+ "ROCM_AITER_TRITON_MLA",
+ 16,
+ AttentionBackendEnum.ROCM_AITER_TRITON_MLA.get_path(),
+ False,
+ ),
+ ],
+)
+def test_mla_backend_selection(
+ env_vars,
+ selected_backend,
+ block_size,
+ expected_backend_path,
+ should_raise,
+ mock_vllm_config,
+ monkeypatch,
+):
+ """Test MLA backend selection with various configurations."""
+ # Set environment variables
+ for key, value in env_vars.items():
+ monkeypatch.setenv(key, value)
+
+ # Import after setting env vars
+ # Reload envs
+ import importlib
+
+ import vllm.envs as envs
+ from vllm.attention.backends.registry import _Backend
+
+ importlib.reload(envs)
+
+ # Mock is_aiter_mla_enabled based on env vars and block_size
+ aiter_enabled = env_vars.get("VLLM_ROCM_USE_AITER") == "1"
+
+ mock_rocm_ops = MagicMock()
+ mock_rocm_ops.is_mla_enabled.return_value = aiter_enabled
+ mock_aiter_module = MagicMock()
+ mock_aiter_module.rocm_aiter_ops = mock_rocm_ops
+
+ with patch.dict("sys.modules", {"vllm._aiter_ops": mock_aiter_module}):
+ # Convert string backend to enum if provided
+ backend_enum = None
+ if selected_backend:
+ backend_enum = getattr(_Backend, selected_backend)
+
+ from vllm.platforms.rocm import RocmPlatform
+
+ if should_raise:
+ with pytest.raises(ValueError):
+ RocmPlatform.get_attn_backend_cls(
+ selected_backend=backend_enum,
+ head_size=128,
+ dtype=torch.float16,
+ kv_cache_dtype="auto",
+ block_size=block_size,
+ use_mla=True,
+ has_sink=False,
+ use_sparse=False,
+ )
+ else:
+ backend_path = RocmPlatform.get_attn_backend_cls(
+ selected_backend=backend_enum,
+ head_size=128,
+ dtype=torch.float16,
+ kv_cache_dtype="auto",
+ block_size=block_size,
+ use_mla=True,
+ has_sink=False,
+ use_sparse=False,
+ )
+ assert backend_path == expected_backend_path
+
+
+def test_aiter_fa_requires_gfx9(mock_vllm_config):
+ """Test that ROCM_AITER_FA requires gfx9 architecture."""
+ from vllm.attention.backends.registry import _Backend
+ from vllm.platforms.rocm import RocmPlatform
+
+ # Mock on_gfx9 to return False
+ with (
+ patch("vllm.platforms.rocm.on_gfx9", return_value=False),
+ pytest.raises(
+ ValueError,
+ match="only supported on gfx9",
+ ),
+ ):
+ RocmPlatform.get_attn_backend_cls(
+ selected_backend=_Backend.ROCM_AITER_FA,
+ head_size=128,
+ dtype=torch.float16,
+ kv_cache_dtype="auto",
+ block_size=16,
+ use_mla=False,
+ has_sink=False,
+ use_sparse=False,
+ )
+
+
+def test_sparse_not_supported(mock_vllm_config):
+ """Test that sparse attention is not supported on ROCm."""
+ from vllm.platforms.rocm import RocmPlatform
+
+ with pytest.raises(
+ AssertionError, match="Sparse MLA backend on ROCm only supports block size 1"
+ ):
+ RocmPlatform.get_attn_backend_cls(
+ selected_backend=None,
+ head_size=128,
+ dtype=torch.float16,
+ kv_cache_dtype="auto",
+ block_size=16,
+ use_mla=False,
+ has_sink=False,
+ use_sparse=True,
+ )
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index f9005fd7d044c..f3ec965bd0881 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -262,30 +262,64 @@ class RocmPlatform(Platform):
f"is not MLA type while requested for MLA backend."
)
- if selected_backend == AttentionBackendEnum.FLEX_ATTENTION:
- logger.info("Using FlexAttention backend.")
- return "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend"
- if (
- rocm_aiter_ops.is_mha_enabled()
- ) or selected_backend == AttentionBackendEnum.ROCM_AITER_FA:
- logger.info("Using Aiter Flash Attention backend.")
- return AttentionBackendEnum.ROCM_AITER_FA.get_path()
- if (
- rocm_aiter_ops.is_triton_unified_attn_enabled()
- ) or selected_backend == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN:
- logger.info("Using Aiter Unified Attention backend.")
- return AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path()
- if (
- envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION
- or selected_backend == AttentionBackendEnum.ROCM_ATTN
- ):
- # rocm specific backend, with aiter and/or
- # triton prefix-prefill
- logger.info("Using Rocm Attention backend.")
+ if selected_backend == AttentionBackendEnum.TRITON_ATTN:
+ logger.info("Using Triton Attention backend on V1 engine.")
+ return AttentionBackendEnum.TRITON_ATTN.get_path()
+
+ if selected_backend == AttentionBackendEnum.ROCM_ATTN:
+ logger.info("Using Rocm Attention backend on V1 engine.")
return AttentionBackendEnum.ROCM_ATTN.get_path()
- # default case, using triton unified attention
- logger.info("Using Triton Attention backend.")
- return AttentionBackendEnum.TRITON_ATTN.get_path()
+
+ if selected_backend == AttentionBackendEnum.ROCM_AITER_FA:
+ if on_gfx9():
+ logger.info("Using Aiter Flash Attention backend on V1 engine.")
+ return AttentionBackendEnum.ROCM_AITER_FA.get_path()
+ else:
+ raise ValueError(
+ f"The selected backend, {selected_backend.name}, "
+ "is only supported on gfx9 architectures."
+ )
+
+ if selected_backend == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN:
+ logger.info("Using Aiter Unified Attention backend on V1 engine.")
+ return AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path()
+
+ # Handle automatic backend selection based on environment variables
+ if selected_backend is None:
+ # Priority 1: Check for AITER Unified Attention (must check before MHA)
+ if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION:
+ logger.info("Using Aiter Unified Attention backend on V1 engine.")
+ return AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path()
+
+ # Priority 2: Check for AITER MHA (Flash Attention)
+ # Only use if explicitly enabled (not just VLLM_ROCM_USE_AITER=1)
+ if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9():
+ logger.info("Using Aiter Flash Attention backend on V1 engine.")
+ return AttentionBackendEnum.ROCM_AITER_FA.get_path()
+
+ # Priority 3: Check for ROCM_ATTN (prefill-decode split)
+ if envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION:
+ logger.info("Using Rocm Attention backend on V1 engine.")
+ return AttentionBackendEnum.ROCM_ATTN.get_path()
+
+ # Priority 4: Check for AITER enabled without specific flags
+ # This defaults to AITER FA only if MHA is not explicitly disabled
+ if (
+ envs.VLLM_ROCM_USE_AITER
+ and on_gfx9()
+ and envs.VLLM_ROCM_USE_AITER_MHA is not False
+ ):
+ logger.info("Using Aiter Flash Attention backend on V1 engine.")
+ return AttentionBackendEnum.ROCM_AITER_FA.get_path()
+
+ # Default: Triton Unified Attention
+ logger.info("Using Triton Attention backend on V1 engine.")
+ return AttentionBackendEnum.TRITON_ATTN.get_path()
+
+ raise RuntimeError(
+ "V0 attention backends have been removed. Set VLLM_USE_V1=1 "
+ "to select a supported backend."
+ )
@classmethod
def set_device(cls, device: torch.device) -> None:
From 656516c3158ef932c6a19a6aab9fdf4df74b105f Mon Sep 17 00:00:00 2001
From: Aydin Abiar <62435714+Aydin-ab@users.noreply.github.com>
Date: Mon, 24 Nov 2025 07:28:51 -0800
Subject: [PATCH 011/134] [Bugfix] properly handle nested json with llama3 tool
parser (#27701)
Signed-off-by: Aydin Abiar
Signed-off-by: Aydin Abiar <62435714+Aydin-ab@users.noreply.github.com>
Co-authored-by: Aydin Abiar
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Chauncey
---
.../test_llama3_json_tool_parser.py | 128 ++++++++++++++++++
.../openai/tool_parsers/llama_tool_parser.py | 116 ++++++++++------
2 files changed, 203 insertions(+), 41 deletions(-)
diff --git a/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py
index 2b68a653f4600..37e52d2cdf609 100644
--- a/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py
+++ b/tests/entrypoints/openai/tool_parsers/test_llama3_json_tool_parser.py
@@ -1,6 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from unittest.mock import MagicMock, patch
+
import pytest
from vllm.entrypoints.openai.protocol import ExtractedToolCallInformation
@@ -132,3 +134,129 @@ def test_extract_tool_calls_multiple_json_with_surrounding_text(parser):
assert result.tool_calls[0].function.name == "searchTool"
assert result.tool_calls[1].function.name == "getOpenIncidentsTool"
assert result.tool_calls[2].function.name == "searchTool"
+
+
+def test_extract_tool_calls_deeply_nested_json(parser):
+ # Test with deeply nested JSON parameters (5 levels)
+ model_output = (
+ '{"name": "complexTool", '
+ '"parameters": {'
+ '"level1": {'
+ '"level2": {'
+ '"level3": {'
+ '"level4": {'
+ '"value": "deep"'
+ "}}}}}}"
+ )
+ result = parser.extract_tool_calls(model_output, None)
+
+ assert result.tools_called is True
+ assert len(result.tool_calls) == 1
+ assert result.tool_calls[0].function.name == "complexTool"
+ # Verify the nested structure is preserved in the arguments
+ import json
+
+ args = json.loads(result.tool_calls[0].function.arguments)
+ assert args["level1"]["level2"]["level3"]["level4"]["value"] == "deep"
+
+
+def test_extract_tool_calls_multiple_with_deep_nesting(parser):
+ # Test with multiple tool calls where some have deeply nested parameters
+ model_output = (
+ '{"name": "simpleTool", "parameters": {"value": "test"}}; '
+ '{"name": "complexTool", "parameters": '
+ '{"config": {"database": {"connection": {"pool": {"size": 10}}}}}}'
+ )
+ result = parser.extract_tool_calls(model_output, None)
+
+ assert result.tools_called is True
+ assert len(result.tool_calls) == 2
+
+ # Check first tool call
+ assert result.tool_calls[0].function.name == "simpleTool"
+ import json
+
+ args0 = json.loads(result.tool_calls[0].function.arguments)
+ assert args0["value"] == "test"
+
+ # Check second tool call with deep nesting
+ assert result.tool_calls[1].function.name == "complexTool"
+ args1 = json.loads(result.tool_calls[1].function.arguments)
+ assert args1["config"]["database"]["connection"]["pool"]["size"] == 10
+
+
+def test_extract_tool_calls_with_quotes_and_brackets_in_string(parser):
+ # Test with quotes and brackets inside quoted string values
+ model_output = (
+ '{"name": "searchTool", '
+ '"parameters": {'
+ '"query": "test {value} [complex]",'
+ '"nested": {"inner": "more {brackets}"}'
+ "}}"
+ )
+ result = parser.extract_tool_calls(model_output, None)
+
+ assert result.tools_called is True
+ assert len(result.tool_calls) == 1
+ assert result.tool_calls[0].function.name == "searchTool"
+ # Verify the string values are preserved including brackets and quotes
+ import json
+
+ args = json.loads(result.tool_calls[0].function.arguments)
+ assert args["query"] == "test {value} [complex]"
+ assert args["nested"]["inner"] == "more {brackets}"
+
+
+def test_extract_tool_calls_with_escaped_quotes_in_nested_json(parser):
+ # Test with escaped quotes in deeply nested JSON
+ model_output = (
+ '{"name": "parserTool", "parameters": {"text": "He said \\"Hello {world}\\""}}'
+ )
+ result = parser.extract_tool_calls(model_output, None)
+
+ assert result.tools_called is True
+ assert len(result.tool_calls) == 1
+ assert result.tool_calls[0].function.name == "parserTool"
+ # Verify escaped quotes are preserved
+ import json
+
+ args = json.loads(result.tool_calls[0].function.arguments)
+ assert args["text"] == 'He said "Hello {world}"'
+
+
+def test_extract_tool_calls_missing_name_key(parser):
+ # Test that missing "name" key returns content
+ model_output = '{"parameters": {}}'
+ result = parser.extract_tool_calls(model_output, None)
+
+ assert result.tools_called is False
+ assert len(result.tool_calls) == 0
+ assert result.content == model_output
+
+
+def test_extract_tool_calls_missing_parameters_and_arguments_key(parser):
+ # Test that missing both "parameters" and "arguments" keys returns content
+ model_output = '{"name": "toolWithoutParams"}'
+ result = parser.extract_tool_calls(model_output, None)
+
+ assert result.tools_called is False
+ assert len(result.tool_calls) == 0
+ assert result.content == model_output
+
+
+def test_regex_timeout_handling(parser):
+ """Test regex timeout is handled gracefully"""
+ fake_problematic_input = "{hello world[A(A=" + "\t)A(A=,\t" * 2
+
+ # create a mock regex that raises TimeoutError
+ mock_regex = MagicMock()
+ mock_regex.finditer.side_effect = TimeoutError("Regex timeout")
+
+ with patch.object(parser, "tool_call_start_regex", mock_regex):
+ result = parser.extract_tool_calls(fake_problematic_input, None)
+
+ # should treat as regular text when regex times out
+ assert result.content == fake_problematic_input
+ assert result.tools_called is False
+ assert len(result.tool_calls) == 0
+ mock_regex.finditer.assert_called_once()
diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
index 02fc9b8a4d34e..e1fe6e90dfd0b 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
@@ -9,6 +9,7 @@ import regex as re
from partial_json_parser.core.options import Allow
from transformers import PreTrainedTokenizerBase
+import vllm.envs as envs
from vllm.entrypoints.chat_utils import make_tool_call_id
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
@@ -56,12 +57,10 @@ class Llama3JsonToolParser(ToolParser):
self.bot_token_id = tokenizer.encode(self.bot_token, add_special_tokens=False)[
0
]
- # Updated regex to match multiple JSONs separated by semicolons
- # This pattern is more robust and can handle nested JSON objects
- self.tool_call_regex = re.compile(
- r"{[^{}]*(?:{[^{}]*}[^{}]*)*}(?:\s*;\s*{[^{}]*(?:{[^{}]*}[^{}]*)*})*",
- re.DOTALL,
- )
+ # Simple regex to find opening braces - we'll use JSON decoder for parsing
+ # This handles arbitrary nesting depth correctly
+ self.tool_call_start_regex = re.compile(r"\{")
+ self.json_decoder = json.JSONDecoder()
def extract_tool_calls(
self, model_output: str, request: ChatCompletionRequest
@@ -77,49 +76,84 @@ class Llama3JsonToolParser(ToolParser):
tools_called=False, tool_calls=[], content=model_output
)
- # Find JSON object(s) in the text using regex
- match = self.tool_call_regex.search(model_output)
- if not match:
+ # Keep track of the end index of the last parsed JSON object
+ # so we don't parse inner brackets
+ end_index = -1
+ tool_calls: list[ToolCall] = []
+
+ try:
+ for match in self.tool_call_start_regex.finditer(
+ model_output, timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS
+ ):
+ start_index = match.start()
+ # Skip if this brace is inside a previously parsed JSON object
+ if start_index <= end_index:
+ continue
+
+ try:
+ obj, json_end_index = self.json_decoder.raw_decode(
+ model_output[start_index:]
+ )
+ end_index = start_index + json_end_index
+
+ # raise KeyError if missing
+ name = obj["name"]
+ arguments_or_params = (
+ obj["arguments"] if "arguments" in obj else obj["parameters"]
+ )
+
+ tool_calls.append(
+ ToolCall(
+ type="function",
+ function=FunctionCall(
+ name=name,
+ # function call args are JSON but as a string
+ arguments=json.dumps(
+ arguments_or_params, ensure_ascii=False
+ ),
+ ),
+ )
+ )
+ except KeyError as e:
+ # Missing required key
+ missing_key = str(e).strip("'\"")
+ logger.exception(
+ "Couldn't extract tool call from JSON response. "
+ "Required key '%s' not present. "
+ "Returning output in content with empty tool calls.",
+ missing_key,
+ )
+ return ExtractedToolCallInformation(
+ tools_called=False, tool_calls=[], content=model_output
+ )
+ except Exception:
+ # Any other error during parsing
+ logger.exception(
+ "Error in extracting tool call from response. "
+ "Returning output in content with empty tool calls"
+ )
+ return ExtractedToolCallInformation(
+ tools_called=False, tool_calls=[], content=model_output
+ )
+ except TimeoutError:
+ logger.warning("Regex timeout occurred when matching tool call pattern.")
+ logger.debug(
+ "Regex timeout occurred when matching user input: %s", model_output
+ )
return ExtractedToolCallInformation(
tools_called=False, tool_calls=[], content=model_output
)
- try:
- json_str = match.group(0)
- # Split by semicolon and strip whitespace
- json_objects = [obj.strip() for obj in json_str.split(";")]
-
- tool_calls: list[ToolCall] = []
- for json_obj in json_objects:
- if not json_obj: # Skip empty strings
- continue
- obj = json.loads(json_obj)
- tool_calls.append(
- ToolCall(
- type="function",
- function=FunctionCall(
- name=obj["name"],
- # function call args are JSON but as a string
- arguments=json.dumps(
- obj["arguments"]
- if "arguments" in obj
- else obj["parameters"],
- ensure_ascii=False,
- ),
- ),
- )
- )
-
+ # If we have valid tool calls, return them normally
+ if tool_calls:
return ExtractedToolCallInformation(
tools_called=True, tool_calls=tool_calls, content=None
)
- except Exception:
- logger.exception("Error in extracting tool call from response.")
- # return information to just treat the tool call as regular JSON
- return ExtractedToolCallInformation(
- tools_called=False, tool_calls=[], content=model_output
- )
+ # No valid tool calls found
+ return ExtractedToolCallInformation(
+ tools_called=False, tool_calls=[], content=model_output
+ )
def extract_tool_calls_streaming(
self,
From e924bbb4f4ac3258a71a18ac4c753c8056bc059f Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath
Date: Mon, 24 Nov 2025 11:06:17 -0500
Subject: [PATCH 012/134] [Build/CI][DP/EP] Add QWen/Qwen3-30B-A3B-FP8 + EPLB
tests to Nightly H100 and B200 (#29195)
Signed-off-by: Varun Sundar Rabindranath
Co-authored-by: Varun Sundar Rabindranath
---
...block_ep.sh => qwen30b_a3b_fp8_block_ep_eplb.sh} | 10 +++++++---
.buildkite/test-amd.yaml | 2 +-
.buildkite/test-pipeline.yaml | 13 +++++++++++--
3 files changed, 19 insertions(+), 6 deletions(-)
rename .buildkite/scripts/scheduled_integration_test/{qwen30b_a3b_fp8_block_ep.sh => qwen30b_a3b_fp8_block_ep_eplb.sh} (82%)
diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
similarity index 82%
rename from .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh
rename to .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
index 0d06f53a183d0..6a1bef275d047 100644
--- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
@@ -1,10 +1,12 @@
#!/usr/bin/env bash
set -euxo pipefail
-# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] [DATA_PARALLEL_SIZE] [TENSOR_PARALLEL_SIZE]
THRESHOLD=${1:-0.8}
NUM_Q=${2:-1319}
PORT=${3:-8020}
+DATA_PARALLEL_SIZE=${4:-2}
+TENSOR_PARALLEL_SIZE=${5:-2}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
@@ -45,8 +47,10 @@ for BACK in "${BACKENDS[@]}"; do
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
- --tensor-parallel-size 2 \
- --data-parallel-size 2 \
+ --enable-eplb \
+ --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
+ --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
+ --data-parallel-size ${DATA_PARALLEL_SIZE} \
--enable-expert-parallel \
--trust-remote-code \
--max-model-len 2048 \
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index f098e23866eb3..4ddf11c0b268f 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1486,4 +1486,4 @@ steps:
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 7a46e919f93bf..f1cd39ef4f948 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -1340,11 +1340,20 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
-- label: Qwen3-30B-A3B-FP8-block Accuracy
+- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+ timeout_in_minutes: 60
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
\ No newline at end of file
From 26a465584ae14a04a1f6e9b36621d70e9d907c10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?=
Date: Mon, 24 Nov 2025 18:18:04 +0100
Subject: [PATCH 013/134] [NIXL] Use config to enable telemetry + NIXL version
bump (#29305)
Signed-off-by: NickLucche
---
requirements/kv_connectors.txt | 2 +-
.../kv_transfer/kv_connector/v1/nixl_connector.py | 9 +++------
2 files changed, 4 insertions(+), 7 deletions(-)
diff --git a/requirements/kv_connectors.txt b/requirements/kv_connectors.txt
index b1f3269cd3813..083230c171096 100644
--- a/requirements/kv_connectors.txt
+++ b/requirements/kv_connectors.txt
@@ -1,2 +1,2 @@
lmcache
-nixl >= 0.6.0 # Required for disaggregated prefill
+nixl >= 0.7.1 # Required for disaggregated prefill
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 7c0911240493c..493938d4aad92 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -4,7 +4,6 @@ import contextlib
import copy
import logging
import math
-import os
import queue
import threading
import time
@@ -810,9 +809,6 @@ class NixlConnectorWorker:
self.nixl_backends = vllm_config.kv_transfer_config.get_from_extra_config(
"backends", ["UCX"]
)
- # TODO temporary, once nixl allows for telemetry flag in config
- # (next release), we can remove this env var.
- os.environ["NIXL_TELEMETRY_ENABLE"] = "1"
# Agent.
non_ucx_backends = [b for b in self.nixl_backends if b != "UCX"]
@@ -828,10 +824,11 @@ class NixlConnectorWorker:
if nixl_agent_config is None:
config = None
else:
+ # Enable telemetry by default for NIXL 0.7.1 and above.
config = (
- nixl_agent_config(backends=self.nixl_backends)
+ nixl_agent_config(backends=self.nixl_backends, capture_telemetry=True)
if len(non_ucx_backends) > 0
- else nixl_agent_config(num_threads=num_threads)
+ else nixl_agent_config(num_threads=num_threads, capture_telemetry=True)
)
self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), config)
From cc313cb73d75cd5ac2715fc45bfadb89888cf8cd Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 24 Nov 2025 09:32:27 -0800
Subject: [PATCH 014/134] [Model Runner V2] Implement Single-step Eagle 1
(#29300)
Signed-off-by: Woosuk Kwon
---
vllm/v1/worker/gpu/input_batch.py | 3 +
vllm/v1/worker/gpu/model_runner.py | 79 +++++++++
vllm/v1/worker/gpu/sampler.py | 5 +-
vllm/v1/worker/gpu/spec_decode/__init__.py | 18 ++
vllm/v1/worker/gpu/spec_decode/eagle.py | 197 +++++++++++++++++++++
5 files changed, 300 insertions(+), 2 deletions(-)
create mode 100644 vllm/v1/worker/gpu/spec_decode/eagle.py
diff --git a/vllm/v1/worker/gpu/input_batch.py b/vllm/v1/worker/gpu/input_batch.py
index 7675cb45170b5..1177d25e300cf 100644
--- a/vllm/v1/worker/gpu/input_batch.py
+++ b/vllm/v1/worker/gpu/input_batch.py
@@ -37,6 +37,9 @@ class InputBuffers:
self.seq_lens = torch.zeros(max_num_reqs, dtype=torch.int32, device=device)
self.cu_num_logits = self._make_buffer(max_num_reqs + 1, dtype=torch.int32)
+ # Spec decoding.
+ self.next_prefill_tokens = self._make_buffer(max_num_reqs, dtype=torch.int32)
+
# Structured outputs.
self.bitmask_indices = self._make_buffer(max_num_reqs, dtype=torch.int32)
self.grammar_bitmask = self._make_buffer(
diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index 6e332ee4b75b8..205298a415d43 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -45,6 +45,7 @@ from vllm.v1.worker.gpu.input_batch import (
prepare_prefill_inputs,
)
from vllm.v1.worker.gpu.sampler import Sampler, compute_prompt_logprobs
+from vllm.v1.worker.gpu.spec_decode import init_speculator
from vllm.v1.worker.gpu.spec_decode.rejection_sample import rejection_sample
from vllm.v1.worker.gpu.states import RequestState, SamplingMetadata
from vllm.v1.worker.gpu.structured_outputs import apply_grammar_bitmask
@@ -97,16 +98,20 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
if self.use_async_scheduling:
self.input_prep_event = torch.cuda.Event()
self.structured_outputs_event = torch.cuda.Event()
+ self.spec_decode_event = torch.cuda.Event()
else:
self.input_prep_event = None
self.structured_outputs_event = None
+ self.spec_decode_event = None
if self.speculative_config is not None:
self.do_spec_decode = True
self.num_speculative_steps = self.speculative_config.num_speculative_tokens
+ self.speculator = init_speculator(self.vllm_config, self.device)
else:
self.do_spec_decode = False
self.num_speculative_steps = 0
+ self.speculator = None
self.req_states = RequestState(
max_num_reqs=self.max_num_reqs,
@@ -153,6 +158,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.vllm_config,
self.device,
)
+ if self.do_spec_decode:
+ self.speculator.load_model(self.model)
time_after_load = time.perf_counter()
self.model_memory_usage = m.consumed_memory
@@ -285,6 +292,33 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
logits = self.model.compute_logits(hidden_states)
self.sampler(logits, sampling_metadata)
+ @torch.inference_mode()
+ def _dummy_speculator_run(
+ self,
+ hidden_states: torch.Tensor,
+ aux_hidden_states: list[torch.Tensor] | None,
+ ) -> None:
+ num_tokens = hidden_states.shape[0]
+ num_reqs = min(num_tokens, self.max_num_reqs)
+ input_batch = InputBatch.make_dummy(
+ num_reqs=num_reqs,
+ num_tokens=num_tokens,
+ input_buffers=self.input_buffers,
+ device=self.device,
+ )
+ sampling_metadata = SamplingMetadata.make_dummy(
+ num_reqs=num_reqs,
+ device=self.device,
+ )
+ num_sampled = torch.ones(num_reqs, dtype=torch.int32, device=self.device)
+ self.propose_draft(
+ input_batch=input_batch,
+ sampling_metadata=sampling_metadata,
+ last_hidden_states=hidden_states,
+ aux_hidden_states=aux_hidden_states,
+ num_sampled=num_sampled,
+ )
+
@torch.inference_mode()
def profile_run(self) -> None:
hidden_states, sample_hidden_states = self._dummy_run(
@@ -292,6 +326,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
skip_attn=True,
)
self._dummy_sampler_run(sample_hidden_states)
+ if self.do_spec_decode:
+ self._dummy_speculator_run(hidden_states, None)
torch.cuda.synchronize()
del hidden_states, sample_hidden_states
gc.collect()
@@ -727,6 +763,41 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.req_states.prefill_len.np[idx_mapping_np],
)
+ @torch.inference_mode()
+ def propose_draft(
+ self,
+ input_batch: InputBatch,
+ sampling_metadata: SamplingMetadata,
+ last_hidden_states: torch.Tensor,
+ aux_hidden_states: list[torch.Tensor] | None,
+ num_sampled: torch.Tensor,
+ ) -> torch.Tensor:
+ num_reqs = input_batch.num_reqs
+ idx_mapping_np = input_batch.idx_mapping_np
+ with async_barrier(self.spec_decode_event):
+ self.input_buffers.next_prefill_tokens.np[:num_reqs] = (
+ self.req_states.prefill_token_ids[
+ idx_mapping_np,
+ self.req_states.num_computed_prefill_tokens[idx_mapping_np],
+ ]
+ )
+ next_prefill_tokens = self.input_buffers.next_prefill_tokens.copy_to_gpu(
+ num_reqs
+ )
+
+ assert self.speculator is not None
+ draft_tokens = self.speculator.propose(
+ input_batch,
+ sampling_metadata,
+ last_hidden_states,
+ aux_hidden_states,
+ num_sampled,
+ self.req_states.last_sampled_tokens,
+ next_prefill_tokens,
+ )
+ self.req_states.draft_tokens[input_batch.idx_mapping] = draft_tokens
+ return draft_tokens
+
def get_cudagraph_and_dp_padding(
self,
scheduler_output: SchedulerOutput,
@@ -913,6 +984,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
self.postprocess(
input_batch, sampler_output.sampled_token_ids, num_sampled_tokens
)
+ if self.do_spec_decode:
+ _ = self.propose_draft(
+ input_batch,
+ sampling_metadata,
+ hidden_states,
+ None, # aux_hidden_states
+ num_sampled_tokens,
+ )
if self.use_async_scheduling:
return async_output
diff --git a/vllm/v1/worker/gpu/sampler.py b/vllm/v1/worker/gpu/sampler.py
index c48ed2d8ca167..d8676079ab951 100644
--- a/vllm/v1/worker/gpu/sampler.py
+++ b/vllm/v1/worker/gpu/sampler.py
@@ -100,8 +100,9 @@ def _gumbel_sample_kernel(
mask=mask,
other=float("-inf"),
)
+ logits = logits.to(tl.float32)
- temp = tl.load(temp_ptr + req_idx)
+ temp = tl.load(temp_ptr + req_idx).to(tl.float32)
if temp != 0.0:
# Calculate the seed for gumbel noise.
seed = tl.load(seeds_ptr + req_idx)
@@ -116,7 +117,7 @@ def _gumbel_sample_kernel(
# Apply temperature.
if APPLY_TEMPERATURE:
# NOTE(woosuk): Use div_rn to match the behavior of torch.
- logits = tl.div_rn(logits, temp.to(tl.float32))
+ logits = tl.div_rn(logits, temp)
# Apply gumbel noise.
logits = tl.where(mask, logits + gumbel_noise, float("-inf"))
diff --git a/vllm/v1/worker/gpu/spec_decode/__init__.py b/vllm/v1/worker/gpu/spec_decode/__init__.py
index e69de29bb2d1d..15b85204e05ce 100644
--- a/vllm/v1/worker/gpu/spec_decode/__init__.py
+++ b/vllm/v1/worker/gpu/spec_decode/__init__.py
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+
+from vllm.config import VllmConfig
+
+
+def init_speculator(
+ vllm_config: VllmConfig,
+ device: torch.device,
+):
+ speculative_config = vllm_config.speculative_config
+ assert speculative_config is not None
+ if speculative_config.use_eagle():
+ from vllm.v1.worker.gpu.spec_decode.eagle import EagleSpeculator
+
+ return EagleSpeculator(vllm_config, device)
+ raise NotImplementedError(f"{speculative_config.method} is not supported yet.")
diff --git a/vllm/v1/worker/gpu/spec_decode/eagle.py b/vllm/v1/worker/gpu/spec_decode/eagle.py
new file mode 100644
index 0000000000000..0f11903e14540
--- /dev/null
+++ b/vllm/v1/worker/gpu/spec_decode/eagle.py
@@ -0,0 +1,197 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+import torch.nn as nn
+
+from vllm.config import VllmConfig
+from vllm.config.compilation import CUDAGraphMode
+from vllm.forward_context import set_forward_context
+from vllm.model_executor.model_loader import get_model
+from vllm.triton_utils import tl, triton
+from vllm.v1.worker.gpu.input_batch import InputBatch
+from vllm.v1.worker.gpu.sampler import gumbel_sample
+from vllm.v1.worker.gpu.states import SamplingMetadata
+
+
+class EagleSpeculator:
+ def __init__(self, vllm_config: VllmConfig, device: torch.device):
+ self.vllm_config = vllm_config
+ self.device = device
+
+ self.speculative_config = vllm_config.speculative_config
+ assert self.speculative_config is not None
+ self.method = self.speculative_config.method
+ self.num_speculative_steps = self.speculative_config.num_speculative_tokens
+ self.draft_model_config = self.speculative_config.draft_model_config
+
+ self.scheduler_config = vllm_config.scheduler_config
+ self.max_num_reqs = self.scheduler_config.max_num_seqs
+ self.max_num_tokens = self.scheduler_config.max_num_batched_tokens
+
+ self.input_ids = torch.zeros(
+ self.max_num_tokens, dtype=torch.int32, device=device
+ )
+ self.positions = torch.zeros(
+ self.max_num_tokens, dtype=torch.int64, device=device
+ )
+
+ def load_model(self, target_model: nn.Module) -> None:
+ from vllm.compilation.backends import set_model_tag
+
+ with set_model_tag("eagle_head"):
+ self.model = get_model(
+ vllm_config=self.vllm_config, model_config=self.draft_model_config
+ )
+
+ share_lm_head = True
+ if share_lm_head and hasattr(target_model, "lm_head"):
+ if hasattr(self.model, "lm_head"):
+ del self.model.lm_head
+ self.model.lm_head = target_model.lm_head
+
+ @torch.inference_mode()
+ def propose(
+ self,
+ input_batch: InputBatch,
+ sampling_metadata: SamplingMetadata,
+ # [num_tokens, hidden_size]
+ last_hidden_states: torch.Tensor,
+ # num_layers x [num_tokens, hidden_size]
+ aux_hidden_states: list[torch.Tensor] | None,
+ # [num_reqs]
+ num_sampled: torch.Tensor,
+ # [max_num_reqs, 1]
+ last_sampled: torch.Tensor,
+ # [num_reqs]
+ next_prefill_tokens: torch.Tensor,
+ ) -> torch.Tensor:
+ if aux_hidden_states:
+ assert self.method == "eagle3"
+ hidden_states = self.model.combine_hidden_states(
+ torch.cat(aux_hidden_states, dim=-1)
+ )
+ else:
+ hidden_states = last_hidden_states
+
+ # Get the input ids and last token indices for the speculator.
+ last_token_indices = prepare_eagle_inputs(
+ self.input_ids,
+ input_batch,
+ num_sampled,
+ last_sampled,
+ next_prefill_tokens,
+ )
+ input_ids = self.input_ids[: input_batch.num_tokens_after_padding]
+
+ # Prefill: Run the eagle speculator with eager mode.
+ with set_forward_context(
+ input_batch.attn_metadata,
+ self.vllm_config,
+ num_tokens=input_batch.num_tokens_after_padding,
+ cudagraph_runtime_mode=CUDAGraphMode.NONE,
+ ):
+ ret_hidden_states = self.model(
+ input_ids=input_ids,
+ positions=input_batch.positions,
+ hidden_states=hidden_states,
+ )
+ if self.method == "mtp":
+ last_hidden_states = ret_hidden_states
+ hidden_states = ret_hidden_states
+ else:
+ last_hidden_states, hidden_states = ret_hidden_states
+ sample_hidden_states = last_hidden_states[last_token_indices]
+ logits = self.model.compute_logits(sample_hidden_states)
+
+ num_reqs = input_batch.num_reqs
+ cu_num_logits = input_batch.cu_num_logits[:num_reqs]
+ temperature = sampling_metadata.temperature[cu_num_logits]
+ seed = sampling_metadata.seeds[cu_num_logits]
+ # NOTE(woosuk): We must add 1 to the positions to match the Gumbel noise
+ # used for draft and target sampling.
+ pos = input_batch.positions[last_token_indices] + 1
+ draft_tokens = gumbel_sample(
+ logits, temperature, seed, pos, apply_temperature=True
+ )
+ if self.num_speculative_steps == 1:
+ # Early exit.
+ return draft_tokens.view(-1, 1)
+ raise NotImplementedError("num_speculative_steps > 1 is not supported yet.")
+
+
+@triton.jit
+def _prepare_eagle_inputs_kernel(
+ last_token_indices_ptr,
+ eagle_input_ids_ptr,
+ target_input_ids_ptr,
+ idx_mapping_ptr,
+ last_sampled_ptr,
+ next_prefill_tokens_ptr,
+ num_sampled_ptr,
+ query_start_loc_ptr,
+ cu_num_logits_ptr,
+ BLOCK_SIZE: tl.constexpr,
+):
+ batch_idx = tl.program_id(0)
+ query_start = tl.load(query_start_loc_ptr + batch_idx)
+ query_end = tl.load(query_start_loc_ptr + batch_idx + 1)
+ query_len = query_end - query_start
+
+ # Get the true query length and next token after accounting for rejected tokens.
+ num_sampled = tl.load(num_sampled_ptr + batch_idx)
+ if num_sampled > 0:
+ req_state_idx = tl.load(idx_mapping_ptr + batch_idx)
+ next_token = tl.load(last_sampled_ptr + req_state_idx).to(tl.int32)
+
+ logits_start = tl.load(cu_num_logits_ptr + batch_idx)
+ logits_end = tl.load(cu_num_logits_ptr + batch_idx + 1)
+ num_logits = logits_end - logits_start
+
+ num_rejected = num_logits - num_sampled
+ query_len -= num_rejected
+ else:
+ # Chunked prefilling.
+ # Get the next prefill token.
+ next_token = tl.load(next_prefill_tokens_ptr + batch_idx)
+
+ # Shift target_input_ids by one.
+ for i in range(1, query_len, BLOCK_SIZE):
+ block = i + tl.arange(0, BLOCK_SIZE)
+ mask = block < query_len
+ input_ids = tl.load(target_input_ids_ptr + query_start + block, mask=mask)
+ tl.store(eagle_input_ids_ptr + query_start + block - 1, input_ids, mask=mask)
+
+ last_token_index = query_start + query_len - 1
+ tl.store(last_token_indices_ptr + batch_idx, last_token_index)
+ tl.store(eagle_input_ids_ptr + last_token_index, next_token)
+
+
+def prepare_eagle_inputs(
+ eagle_input_ids: torch.Tensor,
+ input_batch: InputBatch,
+ # [num_reqs]
+ num_sampled: torch.Tensor,
+ # [max_num_reqs, 1]
+ last_sampled: torch.Tensor,
+ # [max_num_reqs]
+ next_prefill_tokens: torch.Tensor,
+) -> torch.Tensor:
+ num_reqs = input_batch.num_reqs
+ last_token_indices = torch.empty(
+ num_reqs,
+ dtype=torch.int64,
+ device=eagle_input_ids.device,
+ )
+ _prepare_eagle_inputs_kernel[(num_reqs,)](
+ last_token_indices,
+ eagle_input_ids,
+ input_batch.input_ids,
+ input_batch.idx_mapping,
+ last_sampled,
+ next_prefill_tokens,
+ num_sampled,
+ input_batch.query_start_loc,
+ input_batch.cu_num_logits,
+ BLOCK_SIZE=1024,
+ )
+ return last_token_indices
From cec418b5df3bf032a83b6a6795e8026d39e199bd Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 24 Nov 2025 09:34:37 -0800
Subject: [PATCH 015/134] [Model Runner V2] Change Numba AoT to JIT (#29328)
Signed-off-by: Woosuk Kwon
---
vllm/v1/worker/gpu/input_batch.py | 71 +++++++-----------------------
vllm/v1/worker/gpu/model_runner.py | 24 ++++++----
2 files changed, 32 insertions(+), 63 deletions(-)
diff --git a/vllm/v1/worker/gpu/input_batch.py b/vllm/v1/worker/gpu/input_batch.py
index 1177d25e300cf..3ac43ea4952de 100644
--- a/vllm/v1/worker/gpu/input_batch.py
+++ b/vllm/v1/worker/gpu/input_batch.py
@@ -4,7 +4,6 @@ from dataclasses import dataclass
from typing import Any
import numba
-import numba.types as types
import numpy as np
import torch
@@ -147,80 +146,42 @@ class InputBatch:
)
-# NOTE: With the type annotations, this function is pre-compiled
-# before the first call.
-@numba.jit(
- [
- types.none(
- types.int32[:], # idx_mapping
- types.int32[:], # num_scheduled_tokens
- types.int32[:, :], # prefill_token_ids
- types.int32[:], # num_computed_prefill_tokens
- types.int32[:], # prefill_len
- types.int32[:], # input_ids
- types.int32[:], # query_start_loc
- )
- ],
- nopython=True,
- cache=True,
-)
+@numba.njit(cache=True)
def _prepare_prefill_inputs(
- idx_mapping: np.ndarray, # batch_idx -> req_idx
- num_scheduled_tokens: np.ndarray, # [B]
+ idx_mapping: np.ndarray, # [B]
+ query_lens: np.ndarray, # [B]
+ query_start_loc: np.ndarray, # [B + 1]
prefill_token_ids: np.ndarray, # [N, max_model_len]
num_computed_prefill_tokens: np.ndarray, # [N]
- prefill_len: np.ndarray, # [N]
input_ids: np.ndarray, # [num_input_tokens]
- query_start_loc: np.ndarray, # [B + 1]
) -> None:
- num_reqs = num_scheduled_tokens.shape[0]
- query_start_loc[0] = 0
-
- cu_num_tokens = 0
+ num_reqs = idx_mapping.shape[0]
+ query_starts = query_start_loc[:num_reqs]
+ query_ends = query_start_loc[1 : num_reqs + 1]
+ starts = num_computed_prefill_tokens[idx_mapping]
+ ends = starts + query_lens
for i in range(num_reqs):
- req_idx = idx_mapping[i]
- query_len = num_scheduled_tokens[i]
-
- start = num_computed_prefill_tokens[req_idx]
- end = min(start + query_len, prefill_len[req_idx])
- n = end - start
-
- start_idx = cu_num_tokens
- input_ids[start_idx : start_idx + n] = prefill_token_ids[req_idx, start:end]
-
- cu_num_tokens = start_idx + query_len
- query_start_loc[i + 1] = cu_num_tokens
-
- # Pad the inputs for CUDA graphs.
- # Note: pad query_start_loc to be non-decreasing, as kernels
- # like FlashAttention requires that
- query_start_loc[num_reqs + 1 :].fill(cu_num_tokens)
+ input_ids[query_starts[i] : query_ends[i]] = prefill_token_ids[
+ idx_mapping[i], starts[i] : ends[i]
+ ]
def prepare_prefill_inputs(
idx_mapping: np.ndarray,
num_scheduled_tokens: np.ndarray,
- total_num_tokens: int,
+ query_start_loc: np.ndarray,
prefill_token_ids: np.ndarray,
num_computed_prefill_tokens: np.ndarray,
- prefill_len: np.ndarray,
- input_ids: CpuGpuBuffer,
- query_start_loc: CpuGpuBuffer,
+ input_ids: np.ndarray,
) -> None:
_prepare_prefill_inputs(
idx_mapping,
num_scheduled_tokens,
+ query_start_loc,
prefill_token_ids,
num_computed_prefill_tokens,
- prefill_len,
- input_ids.np,
- query_start_loc.np,
+ input_ids,
)
- input_ids.copy_to_gpu(total_num_tokens)
- # NOTE(woosuk): We should copy the whole query_start_loc and seq_lens
- # tensors from CPU to GPU, because they may include paddings needed
- # for full CUDA graph mode.
- query_start_loc.copy_to_gpu()
@triton.jit
diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index 205298a415d43..e0ed183d3c5b0 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -502,20 +502,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# Block tables: num_kv_cache_groups x [num_reqs, max_num_blocks]
block_tables = self.block_tables.gather_block_tables(idx_mapping)
- # Copy prefill tokens from CPU to GPU and get query_start_loc.
+ # Get query_start_loc.
+ np.cumsum(
+ num_scheduled_tokens,
+ out=self.input_buffers.query_start_loc.np[1 : num_reqs + 1],
+ )
+ # Pad for full CUDA graph mode.
+ # Some attention backends like FA3 require query_start_loc to be non-decreasing.
+ self.input_buffers.query_start_loc.np[num_reqs + 1 :] = num_tokens
+ self.input_buffers.query_start_loc.copy_to_gpu()
+ query_start_loc_gpu = self.input_buffers.query_start_loc.gpu[: num_reqs + 1]
+ query_start_loc_np = self.input_buffers.query_start_loc.np[: num_reqs + 1]
+
+ # Copy prefill tokens from CPU to GPU.
prepare_prefill_inputs(
idx_mapping_np,
num_scheduled_tokens,
- num_tokens,
+ query_start_loc_np,
self.req_states.prefill_token_ids,
self.req_states.num_computed_prefill_tokens,
- self.req_states.prefill_len.np,
- self.input_buffers.input_ids,
- self.input_buffers.query_start_loc,
+ self.input_buffers.input_ids.np,
)
- query_start_loc = self.input_buffers.query_start_loc
- query_start_loc_gpu = query_start_loc.gpu[: num_reqs + 1]
- query_start_loc_np = query_start_loc.np[: num_reqs + 1]
+ self.input_buffers.input_ids.copy_to_gpu(num_tokens)
# Prepare positions and seq_lens.
prepare_pos_seq_lens(
From 8f066146c395dfadb86914c88d9a0f3173f8fa39 Mon Sep 17 00:00:00 2001
From: bnellnm <49004751+bnellnm@users.noreply.github.com>
Date: Mon, 24 Nov 2025 13:38:04 -0500
Subject: [PATCH 016/134] [MoE][Refactor] Make select_experts a non-static
method (#29067)
Signed-off-by: Bill Nell
---
tests/kernels/moe/test_flashinfer.py | 19 +--
tests/test_routing_simulator.py | 35 ++++-
.../layers/fused_moe/fused_moe_method_base.py | 6 +-
.../fused_moe/fused_moe_modular_method.py | 41 +-----
vllm/model_executor/layers/fused_moe/layer.py | 118 +++++++++--------
.../fused_moe/unquantized_fused_moe_method.py | 32 +----
.../layers/quantization/awq_marlin.py | 17 +--
.../layers/quantization/bitsandbytes.py | 20 +--
.../compressed_tensors_moe.py | 123 +++---------------
.../layers/quantization/experts_int8.py | 19 +--
.../model_executor/layers/quantization/fp8.py | 29 +----
.../layers/quantization/gguf.py | 17 +--
.../layers/quantization/gptq_marlin.py | 19 +--
.../layers/quantization/modelopt.py | 45 ++-----
.../layers/quantization/moe_wna16.py | 17 +--
.../layers/quantization/mxfp4.py | 23 +---
.../layers/quantization/quark/quark_moe.py | 38 +-----
.../model_executor/layers/quantization/rtn.py | 17 +--
18 files changed, 163 insertions(+), 472 deletions(-)
diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py
index 638741e91619b..a6977f222408d 100644
--- a/tests/kernels/moe/test_flashinfer.py
+++ b/tests/kernels/moe/test_flashinfer.py
@@ -11,7 +11,6 @@ from vllm.model_executor.layers.fused_moe.config import (
fp8_w8a8_moe_quant_config,
)
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
-from vllm.model_executor.layers.fused_moe.layer import FusedMoE
from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
apply_flashinfer_per_tensor_scale_fp8,
flashinfer_cutlass_moe_fp8,
@@ -151,14 +150,11 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph(
td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=True)
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids = Llama4MoE.custom_routing_function(
hidden_states=td.hidden_states,
- router_logits=score,
- use_grouped_topk=False,
- top_k=topk,
+ gating_output=score,
+ topk=topk,
renormalize=False,
- custom_routing_function=Llama4MoE.custom_routing_function,
- scoring_func="softmax",
)
quant_config = fp8_w8a8_moe_quant_config(
@@ -219,14 +215,11 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
)
score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids = Llama4MoE.custom_routing_function(
hidden_states=td.hidden_states,
- router_logits=score,
- use_grouped_topk=False,
- top_k=topk,
+ gating_output=score,
+ topk=topk,
renormalize=False,
- custom_routing_function=Llama4MoE.custom_routing_function,
- scoring_func="softmax",
)
quant_config = fp8_w8a8_moe_quant_config(
diff --git a/tests/test_routing_simulator.py b/tests/test_routing_simulator.py
index 5a162fa8f791b..e8826eb441a24 100644
--- a/tests/test_routing_simulator.py
+++ b/tests/test_routing_simulator.py
@@ -9,9 +9,16 @@ different routing strategies and analyze their performance, including
integration tests with FusedMoE layer.
"""
+import tempfile
+
import pytest
import torch
+from vllm.config import VllmConfig, set_current_vllm_config
+from vllm.distributed import (
+ init_distributed_environment,
+ initialize_model_parallel,
+)
from vllm.model_executor.layers.fused_moe.routing_simulator import (
DistributionBasedRouting,
RoutingSimulator,
@@ -89,6 +96,28 @@ def test_routing_strategy_integration(monkeypatch, device):
# Test different routing strategies
strategies = RoutingSimulator.get_available_strategies()
+ vllm_config = VllmConfig()
+ with set_current_vllm_config(vllm_config):
+ temp_file = tempfile.mkstemp()[1]
+ init_distributed_environment(
+ world_size=1,
+ rank=0,
+ local_rank=0,
+ distributed_init_method=f"file://{temp_file}",
+ )
+ initialize_model_parallel(
+ tensor_model_parallel_size=1,
+ pipeline_model_parallel_size=1,
+ )
+ fused_moe = FusedMoE(
+ num_experts=num_experts,
+ top_k=top_k,
+ hidden_size=hidden_size,
+ intermediate_size=0,
+ use_grouped_topk=False,
+ renormalize=True,
+ )
+
for strategy in strategies:
# Set environment variable
env_name = "VLLM_MOE_ROUTING_SIMULATION_STRATEGY"
@@ -98,13 +127,9 @@ def test_routing_strategy_integration(monkeypatch, device):
envs.environment_variables[env_name] = lambda s=strategy: s
# Test the select_experts method
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = fused_moe.select_experts(
hidden_states=hidden_states,
router_logits=router_logits,
- top_k=top_k,
- use_grouped_topk=False,
- renormalize=True,
- indices_type=torch.long,
)
# Verify output shapes
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
index 073e90a4e6808..ef7090c349fc6 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
@@ -90,10 +90,14 @@ class FusedMoEMethodBase(QuantizeMethodBase):
def allow_inplace(self) -> bool:
return False
+ @property
+ def method_name(self) -> str:
+ return self.__class__.__name__
+
@abstractmethod
def apply(
self,
- layer: torch.nn.Module,
+ layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
index c6dc95acdb636..c23c41df226f0 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
@@ -66,6 +66,10 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
def allow_inplace(self) -> bool:
return self.old_quant_method.allow_inplace
+ @property
+ def method_name(self) -> str:
+ return self.old_quant_method.method_name
+
def create_weights(
self,
layer: torch.nn.Module,
@@ -84,7 +88,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
def apply(
self,
- layer: torch.nn.Module,
+ layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -105,42 +109,9 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- # Is getattr needed?
- zero_expert_num = getattr(layer, "zero_expert_num", 0)
- zero_expert_type = getattr(layer, "zero_expert_type", None)
-
- if enable_eplb:
- if self.supports_eplb:
- assert expert_load_view is not None
- assert logical_to_physical_map is not None
- assert logical_replica_count is not None
- else:
- raise NotImplementedError(
- "EPLB is not supported for "
- f"{self.old_quant_method.__class__.__name__}."
- )
-
topk_weights, topk_ids, zero_expert_result = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
- enable_eplb=enable_eplb,
- expert_map=expert_map,
- expert_load_view=expert_load_view,
- logical_to_physical_map=logical_to_physical_map,
- logical_replica_count=logical_replica_count,
- global_num_experts=global_num_experts,
- zero_expert_num=zero_expert_num,
- zero_expert_type=zero_expert_type,
)
result = self.fused_experts(
@@ -156,7 +127,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
expert_map=None if self.disable_expert_map else expert_map,
)
- if zero_expert_num != 0 and zero_expert_type is not None:
+ if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
assert not isinstance(result, tuple), (
"Shared + zero experts are mutually exclusive not yet supported"
)
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 6619b64b2bbc0..0ef3130b26333 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1510,30 +1510,11 @@ class FusedMoE(CustomOp):
logits_shape, dtype=moe.in_dtype, device=torch.cuda.current_device()
)
- @staticmethod
def select_experts(
+ self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
- top_k: int,
- use_grouped_topk: bool,
- renormalize: bool,
- topk_group: int | None = None,
- num_expert_group: int | None = None,
- custom_routing_function: Callable | None = None,
- scoring_func: str = "softmax",
- routed_scaling_factor: float = 1.0,
- e_score_correction_bias: torch.Tensor | None = None,
- indices_type: torch.dtype | None = None,
- enable_eplb: bool = False,
- expert_map: torch.Tensor | None = None,
- expert_load_view: torch.Tensor | None = None,
- logical_to_physical_map: torch.Tensor | None = None,
- logical_replica_count: torch.Tensor | None = None,
- global_num_experts: int | None = None,
- zero_expert_num: int | None = None,
- zero_expert_type: str | None = None,
- num_fused_shared_experts: int = 0,
- ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
"""
Route the input hidden states to the top-k experts based on the
router logits.
@@ -1552,6 +1533,27 @@ class FusedMoE(CustomOp):
fused_topk_bias,
)
+ if self.enable_eplb:
+ if self.quant_method.supports_eplb:
+ if self.expert_load_view is None:
+ raise ValueError(
+ "enable_eplb=True requires expert_load_view != None"
+ )
+ if self.logical_to_physical_map is None:
+ raise ValueError(
+ "enable_eplb=True requires logical_to_physical_map != None"
+ )
+ if self.logical_replica_count is None:
+ raise ValueError(
+ "enable_eplb=True requires logical_replica_count != None"
+ )
+ else:
+ raise NotImplementedError(
+ f"EPLB is not supported for {self.quant_method.method_name}."
+ )
+
+ indices_type = self.quant_method.topk_indices_dtype
+
# Check if we should use a routing simulation strategy
routing_strategy = envs.VLLM_MOE_ROUTING_SIMULATION_STRATEGY
if routing_strategy != "":
@@ -1559,20 +1561,20 @@ class FusedMoE(CustomOp):
hidden_states=hidden_states,
router_logits=router_logits,
strategy_name=routing_strategy,
- top_k=top_k,
+ top_k=self.top_k,
indices_type=indices_type,
)
# DeepSeekv2 uses grouped_top_k
- elif use_grouped_topk:
- assert topk_group is not None
- assert num_expert_group is not None
+ elif self.use_grouped_topk:
+ assert self.topk_group is not None
+ assert self.num_expert_group is not None
if rocm_aiter_ops.is_fused_moe_enabled():
if not rocm_aiter_ops.is_fusion_moe_shared_experts_enabled():
- assert num_fused_shared_experts == 0
+ assert self.num_fused_shared_experts == 0
grouped_topk_impl = partial(
rocm_aiter_grouped_topk,
- num_fused_shared_experts=num_fused_shared_experts,
+ num_fused_shared_experts=self.num_fused_shared_experts,
)
else:
grouped_topk_impl = grouped_topk
@@ -1580,50 +1582,46 @@ class FusedMoE(CustomOp):
topk_weights, topk_ids = grouped_topk_impl(
hidden_states=hidden_states,
gating_output=router_logits,
- topk=top_k,
- renormalize=renormalize,
- num_expert_group=num_expert_group,
- topk_group=topk_group,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
+ topk=self.top_k,
+ renormalize=self.renormalize,
+ num_expert_group=self.num_expert_group,
+ topk_group=self.topk_group,
+ scoring_func=self.scoring_func,
+ routed_scaling_factor=self.routed_scaling_factor,
+ e_score_correction_bias=self.e_score_correction_bias,
)
- elif e_score_correction_bias is not None:
+ elif self.e_score_correction_bias is not None:
topk_weights, topk_ids = fused_topk_bias(
hidden_states=hidden_states,
gating_output=router_logits,
- e_score_correction_bias=e_score_correction_bias.data,
- topk=top_k,
- renormalize=renormalize,
+ e_score_correction_bias=self.e_score_correction_bias.data,
+ topk=self.top_k,
+ renormalize=self.renormalize,
)
- if routed_scaling_factor != 1.0:
- topk_weights *= routed_scaling_factor
- elif custom_routing_function is None:
+ if self.routed_scaling_factor != 1.0:
+ topk_weights *= self.routed_scaling_factor
+ elif self.custom_routing_function is None:
topk_weights, topk_ids, token_expert_indices = fused_topk(
hidden_states=hidden_states,
gating_output=router_logits,
- topk=top_k,
- renormalize=renormalize,
+ topk=self.top_k,
+ renormalize=self.renormalize,
indices_type=indices_type,
)
else:
- topk_weights, topk_ids = custom_routing_function(
+ topk_weights, topk_ids = self.custom_routing_function(
hidden_states=hidden_states,
gating_output=router_logits,
- topk=top_k,
- renormalize=renormalize,
+ topk=self.top_k,
+ renormalize=self.renormalize,
)
- if enable_eplb:
- assert expert_load_view is not None
- assert logical_to_physical_map is not None
- assert logical_replica_count is not None
-
+ if self.enable_eplb:
topk_ids = eplb_map_to_physical_and_record(
topk_ids=topk_ids,
- expert_load_view=expert_load_view,
- logical_to_physical_map=logical_to_physical_map,
- logical_replica_count=logical_replica_count,
+ expert_load_view=self.expert_load_view,
+ logical_to_physical_map=self.logical_to_physical_map,
+ logical_replica_count=self.logical_replica_count,
)
if (indices_type is not None) and topk_ids.dtype != indices_type:
@@ -1633,16 +1631,16 @@ class FusedMoE(CustomOp):
# Compute zero expert result if needed
if (
- zero_expert_num is not None
- and zero_expert_num > 0
- and zero_expert_type is not None
- and global_num_experts is not None
+ self.zero_expert_num is not None
+ and self.zero_expert_num > 0
+ and self.zero_expert_type is not None
+ and self.global_num_experts is not None
):
zero_expert_result = zero_experts_compute_triton(
expert_indices=topk_ids,
expert_scales=topk_weights,
- num_experts=global_num_experts,
- zero_expert_type=zero_expert_type,
+ num_experts=self.global_num_experts,
+ zero_expert_type=self.zero_expert_type,
hidden_states=hidden_states,
)
else:
diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
index 63b0e6f573d65..48e5a8907f926 100644
--- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
+++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
@@ -331,7 +331,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
def forward_cuda(
self,
- layer: torch.nn.Module,
+ layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
x: torch.Tensor,
use_grouped_topk: bool,
top_k: int,
@@ -352,31 +352,9 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- zero_expert_num = getattr(layer, "zero_expert_num", 0)
- zero_expert_type = getattr(layer, "zero_expert_type", None)
-
topk_weights, topk_ids, zero_expert_result = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
- enable_eplb=enable_eplb,
- expert_map=expert_map,
- expert_load_view=expert_load_view,
- logical_to_physical_map=logical_to_physical_map,
- logical_replica_count=logical_replica_count,
- global_num_experts=global_num_experts,
- zero_expert_num=zero_expert_num,
- zero_expert_type=zero_expert_type,
- num_fused_shared_experts=layer.num_fused_shared_experts,
)
if self.rocm_aiter_moe_enabled:
@@ -415,7 +393,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
expert_map=expert_map,
)
- if zero_expert_num != 0 and zero_expert_type is not None:
+ if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
assert not isinstance(result, tuple), (
"Shared + zero experts are mutually exclusive not yet supported"
)
@@ -425,7 +403,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
def forward_cpu(
self,
- layer: torch.nn.Module,
+ layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
x: torch.Tensor,
use_grouped_topk: bool,
top_k: int,
@@ -474,7 +452,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
def forward_xpu(
self,
- layer: torch.nn.Module,
+ layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
x: torch.Tensor,
use_grouped_topk: bool,
top_k: int,
@@ -515,7 +493,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
def forward_tpu(
self,
- layer: torch.nn.Module,
+ layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
x: torch.Tensor,
use_grouped_topk: bool,
top_k: int,
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index 3f6ea68072b40..66945e2d2a7c8 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -597,7 +597,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -618,24 +618,11 @@ class AWQMoEMethod(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError("EPLB not supported for `AWQMoEMethod` yet.")
-
assert activation == "silu", "Only SiLU activation is supported."
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
return fused_marlin_moe(
diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index e5a741e639ad9..1e57fa218b797 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -495,7 +495,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -518,25 +518,11 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
from vllm.model_executor.layers.fused_moe import fused_experts
- if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `BitsAndBytesMoEMethod` yet."
- )
-
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
+ # TODO(bnell): Do these need to be called on the hot path?
if self.quant_config.load_in_8bit:
w13, w2 = self._apply_8bit_dequant(layer)
else:
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index ad547dd409822..149e4419c64a4 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -511,7 +511,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -532,16 +532,17 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `CompressedTensorsW4A4MoeMethod` yet."
- )
assert activation == "silu", "Only SiLU activation is supported."
if (
self.allow_flashinfer
and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
):
+ if enable_eplb:
+ raise NotImplementedError(
+ "EPLB not supported for `CompressedTensorsW4A4MoeMethod` yet."
+ )
+
return flashinfer_trtllm_fp4_moe(
layer=layer,
x=x,
@@ -554,19 +555,9 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod):
e_score_correction_bias=e_score_correction_bias,
)
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
if self.use_marlin:
@@ -1109,7 +1100,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -1130,31 +1121,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- assert expert_load_view is not None
- assert logical_to_physical_map is not None
- assert logical_replica_count is not None
- assert isinstance(layer, FusedMoE)
-
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
- num_fused_shared_experts=layer.num_fused_shared_experts,
- enable_eplb=enable_eplb,
- expert_map=expert_map,
- expert_load_view=expert_load_view,
- logical_to_physical_map=logical_to_physical_map,
- logical_replica_count=logical_replica_count,
)
per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN
@@ -1377,7 +1346,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -1398,26 +1367,11 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `CompressedTensorsW8A8Int8MoEMethod` yet."
- )
-
from vllm.model_executor.layers.fused_moe import fused_experts
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
return fused_experts(
@@ -1738,7 +1692,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -1759,26 +1713,11 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `CompressedTensorsWNA16MarlinMoEMethod` yet."
- )
-
assert activation == "silu", f"{activation} not supported for Marlin MoE."
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
return fused_marlin_moe(
@@ -2001,7 +1940,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -2022,43 +1961,11 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- if expert_load_view is None:
- raise ValueError("enable_eplb=True requiere expert_load_view != None")
- if logical_to_physical_map is None:
- raise ValueError(
- "enable_eplb=True requiere logical_to_physical_map != None"
- )
- if logical_replica_count is None:
- raise ValueError(
- "enable_eplb=True requiere logical_replica_count != None"
- )
- if not isinstance(layer, FusedMoE):
- raise TypeError(
- "EPLB is only supported when `layer` is a instance of FusedMoE."
- )
-
from vllm.model_executor.layers.fused_moe import fused_experts
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
- num_fused_shared_experts=getattr(layer, "num_fused_shared_experts", 0),
- enable_eplb=enable_eplb,
- expert_map=expert_map,
- expert_load_view=expert_load_view,
- logical_to_physical_map=logical_to_physical_map,
- logical_replica_count=logical_replica_count,
)
return fused_experts(
diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py
index 5241f9a2301be..7ebe40ec84687 100644
--- a/vllm/model_executor/layers/quantization/experts_int8.py
+++ b/vllm/model_executor/layers/quantization/experts_int8.py
@@ -137,7 +137,7 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -158,26 +158,11 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `ExpertsInt8MoEMethod` yet."
- )
-
from vllm.model_executor.layers.fused_moe import fused_experts
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
return fused_experts(
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 91bd45bf879cb..9e2718057038d 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1140,7 +1140,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -1216,31 +1216,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
apply_router_weight_on_input=apply_router_weight_on_input,
)
- zero_expert_num = getattr(layer, "zero_expert_num", 0)
- zero_expert_type = getattr(layer, "zero_expert_type", None)
-
- select_result = FusedMoE.select_experts(
+ select_result = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
- enable_eplb=enable_eplb,
- expert_map=expert_map,
- expert_load_view=expert_load_view,
- logical_to_physical_map=logical_to_physical_map,
- logical_replica_count=logical_replica_count,
- global_num_experts=global_num_experts,
- zero_expert_num=zero_expert_num,
- zero_expert_type=zero_expert_type,
- num_fused_shared_experts=layer.num_fused_shared_experts,
)
topk_weights, topk_ids, zero_expert_result = select_result
@@ -1322,7 +1300,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
self.allow_cutlass_block_scaled_grouped_gemm
),
)
- if zero_expert_num != 0 and zero_expert_type is not None:
+
+ if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
assert not isinstance(result, tuple), (
"Shared + zero experts are mutually exclusive not yet supported"
)
diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index 42d7a67371ae8..bcdfafb50fc5a 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -621,7 +621,7 @@ class GGUFMoEMethod(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -642,9 +642,6 @@ class GGUFMoEMethod(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError("EPLB not supported for `GGUFMoEMethod` yet.")
-
assert activation == "silu", "Only SiLU activation is supported."
if apply_router_weight_on_input:
raise NotImplementedError(
@@ -652,19 +649,9 @@ class GGUFMoEMethod(FusedMoEMethodBase):
"fused GGUF MoE method."
)
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
return fused_moe_gguf(
x,
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 68a122fd46c6b..77b15db373a3a 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -722,7 +722,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -743,26 +743,11 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `GPTQMarlinMoEMethod` yet."
- )
-
assert activation == "silu", "Only SiLU activation is supported."
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
return fused_marlin_moe(
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 01a23168bdde3..8165673135910 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -696,7 +696,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -717,12 +717,11 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `ModelOptFp8MoEMethod` yet."
- )
-
if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM:
+ if layer.enable_eplb:
+ raise NotImplementedError(
+ "EPLB not supported for `ModelOptFp8MoEMethod` yet."
+ )
assert activation == "silu", (
f"Expected 'silu' activation but got {activation}"
)
@@ -740,19 +739,9 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
)
# Expert selection
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS:
@@ -1459,7 +1448,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -1480,16 +1469,16 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `ModelOptNvFp4FusedMoE` yet."
- )
assert activation == "silu", "Only SiLU activation is supported."
if (
self.allow_flashinfer
and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
):
+ if enable_eplb:
+ raise NotImplementedError(
+ "EPLB not supported for `ModelOptNvFp4FusedMoE` yet."
+ )
return flashinfer_trtllm_fp4_moe(
layer=layer,
x=x,
@@ -1502,19 +1491,9 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
e_score_correction_bias=e_score_correction_bias,
)
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
if self.use_marlin:
diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py
index 2090c86f78dc8..cf348290a2716 100644
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -359,7 +359,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -380,25 +380,12 @@ class MoeWNA16Method(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError("EPLB not supported for `MoeWNA16Method` yet.")
-
from vllm.model_executor.layers.fused_moe import fused_experts
assert activation == "silu", "Only SiLU activation is supported."
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
return fused_experts(
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 66ae2e94c60a5..255b5aad17853 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -862,7 +862,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -887,18 +887,9 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
raise NotImplementedError("EPLB is not supported for mxfp4")
if self.mxfp4_backend == Mxfp4Backend.MARLIN:
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
)
return fused_marlin_moe(
@@ -989,17 +980,9 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
):
from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- e_score_correction_bias=e_score_correction_bias,
)
# Backend-specific preparation
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index 30772c3665b06..8be0299eaa66f 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -334,7 +334,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -355,24 +355,9 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `QuarkW8A8Fp8MoEMethod` yet."
- )
-
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
if self.rocm_aiter_moe_enabled:
@@ -609,7 +594,7 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -630,24 +615,9 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError(
- "EPLB not supported for `QuarkOCP_MX_MoEMethod` yet."
- )
-
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
if not self.emulate:
diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py
index 52656263a601b..7b51b828009fc 100644
--- a/vllm/model_executor/layers/quantization/rtn.py
+++ b/vllm/model_executor/layers/quantization/rtn.py
@@ -356,7 +356,7 @@ class RTNMoEMethod(FusedMoEMethodBase):
def apply(
self,
- layer: torch.nn.Module,
+ layer: FusedMoE,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -377,22 +377,9 @@ class RTNMoEMethod(FusedMoEMethodBase):
logical_to_physical_map: torch.Tensor | None = None,
logical_replica_count: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
- if enable_eplb:
- raise NotImplementedError("EPLB not supported for `RTNMoEMethod` yet.")
-
- topk_weights, topk_ids, _ = FusedMoE.select_experts(
+ topk_weights, topk_ids, _ = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
- use_grouped_topk=use_grouped_topk,
- top_k=top_k,
- renormalize=renormalize,
- topk_group=topk_group,
- num_expert_group=num_expert_group,
- custom_routing_function=custom_routing_function,
- scoring_func=scoring_func,
- routed_scaling_factor=routed_scaling_factor,
- e_score_correction_bias=e_score_correction_bias,
- indices_type=self.topk_indices_dtype,
)
return fused_marlin_moe(
From 839c6b7b72bcc7197443019aae32be409f1c0363 Mon Sep 17 00:00:00 2001
From: Chenheli Hua
Date: Mon, 24 Nov 2025 11:24:37 -0800
Subject: [PATCH 017/134] [Multimodal][Qwen3 Omni] Make Qwen3 Omni work with
audio-in-video inputs in V1 engine. (#27721)
Signed-off-by: Chenheli Hua
Signed-off-by: Roger Wang
Co-authored-by: Roger Wang
---
.../qwen3_omni/only_thinker.py | 170 ++++++++++++++
tests/model_executor/test_qwen3_omni.py | 221 ++++++++++++++++++
.../models/qwen2_5_omni_thinker.py | 25 --
.../models/qwen3_omni_moe_thinker.py | 110 ++++++---
4 files changed, 467 insertions(+), 59 deletions(-)
create mode 100644 examples/offline_inference/qwen3_omni/only_thinker.py
create mode 100644 tests/model_executor/test_qwen3_omni.py
diff --git a/examples/offline_inference/qwen3_omni/only_thinker.py b/examples/offline_inference/qwen3_omni/only_thinker.py
new file mode 100644
index 0000000000000..88a61ed694c2e
--- /dev/null
+++ b/examples/offline_inference/qwen3_omni/only_thinker.py
@@ -0,0 +1,170 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This example shows how to use vLLM for running offline inference
+with the correct prompt format on Qwen3-Omni (thinker only).
+"""
+
+from typing import NamedTuple
+
+from vllm import LLM, SamplingParams
+from vllm.assets.audio import AudioAsset
+from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset
+from vllm.multimodal.image import convert_image_mode
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+
+class QueryResult(NamedTuple):
+ inputs: dict
+ limit_mm_per_prompt: dict[str, int]
+
+
+# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
+# lower-end GPUs.
+# Unless specified, these settings have been tested to work on a single L4.
+
+default_system = (
+ "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
+ "Group, capable of perceiving auditory and visual inputs, as well as "
+ "generating text and speech."
+)
+
+
+def get_mixed_modalities_query() -> QueryResult:
+ question = (
+ "What is recited in the audio? "
+ "What is the content of this image? Why is this video funny?"
+ )
+ prompt = (
+ f"<|im_start|>system\n{default_system}<|im_end|>\n"
+ "<|im_start|>user\n<|audio_start|><|audio_pad|><|audio_end|>"
+ "<|vision_start|><|image_pad|><|vision_end|>"
+ "<|vision_start|><|video_pad|><|vision_end|>"
+ f"{question}<|im_end|>\n"
+ f"<|im_start|>assistant\n"
+ )
+ return QueryResult(
+ inputs={
+ "prompt": prompt,
+ "multi_modal_data": {
+ "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
+ "image": convert_image_mode(
+ ImageAsset("cherry_blossom").pil_image, "RGB"
+ ),
+ "video": VideoAsset(name="baby_reading", num_frames=16).np_ndarrays,
+ },
+ },
+ limit_mm_per_prompt={"audio": 1, "image": 1, "video": 1},
+ )
+
+
+def get_use_audio_in_video_query() -> QueryResult:
+ question = (
+ "Describe the content of the video in details, then convert what the "
+ "baby say into text."
+ )
+ prompt = (
+ f"<|im_start|>system\n{default_system}<|im_end|>\n"
+ "<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>"
+ f"{question}<|im_end|>\n"
+ f"<|im_start|>assistant\n"
+ )
+ asset = VideoAsset(name="baby_reading", num_frames=16)
+ audio = asset.get_audio(sampling_rate=16000)
+ return QueryResult(
+ inputs={
+ "prompt": prompt,
+ "multi_modal_data": {
+ "video": asset.np_ndarrays,
+ "audio": audio,
+ },
+ "mm_processor_kwargs": {
+ "use_audio_in_video": True,
+ },
+ },
+ limit_mm_per_prompt={"audio": 1, "video": 1},
+ )
+
+
+def get_multi_audios_query() -> QueryResult:
+ question = "Are these two audio clips the same?"
+ prompt = (
+ f"<|im_start|>system\n{default_system}<|im_end|>\n"
+ "<|im_start|>user\n<|audio_start|><|audio_pad|><|audio_end|>"
+ "<|audio_start|><|audio_pad|><|audio_end|>"
+ f"{question}<|im_end|>\n"
+ f"<|im_start|>assistant\n"
+ )
+ return QueryResult(
+ inputs={
+ "prompt": prompt,
+ "multi_modal_data": {
+ "audio": [
+ AudioAsset("winning_call").audio_and_sample_rate,
+ AudioAsset("mary_had_lamb").audio_and_sample_rate,
+ ],
+ },
+ },
+ limit_mm_per_prompt={
+ "audio": 2,
+ },
+ )
+
+
+query_map = {
+ "mixed_modalities": get_mixed_modalities_query,
+ "use_audio_in_video": get_use_audio_in_video_query,
+ "multi_audios": get_multi_audios_query,
+}
+
+
+def main(args):
+ model_name = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
+ query_result = query_map[args.query_type]()
+
+ llm = LLM(
+ model=model_name,
+ max_model_len=12800,
+ max_num_seqs=5,
+ limit_mm_per_prompt=query_result.limit_mm_per_prompt,
+ seed=args.seed,
+ )
+
+    # We set temperature to 0.2 so that outputs can differ
+    # across identical prompts during batch inference.
+ sampling_params = SamplingParams(temperature=0.2, max_tokens=256)
+
+ outputs = llm.generate(query_result.inputs, sampling_params=sampling_params)
+
+ for o in outputs:
+ generated_text = o.outputs[0].text
+ print(generated_text)
+
+
+def parse_args():
+ parser = FlexibleArgumentParser(
+ description="Demo on using vLLM for offline inference with "
+ "audio language models"
+ )
+ parser.add_argument(
+ "--query-type",
+ "-q",
+ type=str,
+ default="mixed_modalities",
+ choices=query_map.keys(),
+ help="Query type.",
+ )
+ parser.add_argument(
+ "--seed",
+ type=int,
+ default=None,
+ help="Set the seed when initializing `vllm.LLM`.",
+ )
+
+ return parser.parse_args()
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ main(args)
diff --git a/tests/model_executor/test_qwen3_omni.py b/tests/model_executor/test_qwen3_omni.py
new file mode 100644
index 0000000000000..c92c61dcd3bc2
--- /dev/null
+++ b/tests/model_executor/test_qwen3_omni.py
@@ -0,0 +1,221 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from unittest.mock import Mock
+
+import pytest
+from transformers import PretrainedConfig
+
+from vllm.multimodal.processing import InputProcessingContext
+
+
+# Helper function to print input IDs with coalesced audio/video tokens.
+def print_input_ids(input_ids):
+ """
+ Print input IDs, compressing consecutive special tokens.
+ - 151675: <|audio_pad|>
+ - 151656: <|video_pad|>
+ """
+ if not input_ids:
+ print("[]")
+ return
+
+ result = []
+ i = 0
+
+ while i < len(input_ids):
+ current_id = input_ids[i]
+
+ # Check if it's a special token that should be compressed
+ if current_id in [151675, 151656]:
+ # Count consecutive occurrences
+ count = 1
+ while i + count < len(input_ids) and input_ids[i + count] == current_id:
+ count += 1
+
+ # Add compressed representation
+ token_name = "<|audio_pad|>" if current_id == 151675 else "<|video_pad|>"
+ result.append(f"{token_name} * {count}")
+ i += count
+ else:
+ # Regular token, just add it
+ result.append(str(current_id))
+ i += 1
+
+ print(", ".join(result))
+
+
+@pytest.fixture
+def mock_qwen3_omni_config():
+ """Create a mock Qwen3OmniMoeThinker config."""
+ config = Mock(spec=PretrainedConfig)
+ # Token IDs from https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct/blob/main/tokenizer_config.json
+ config.audio_token_id = 151675 # <|audio_pad|>
+ config.video_token_id = 151656 # <|video_pad|>
+ config.image_token_id = 151655 # <|image_pad|>
+ config.audio_start_token_id = 151669 # <|audio_start|>
+ config.audio_end_token_id = 151670 # <|audio_end|>
+ config.vision_start_token_id = 151652 # <|vision_start|>
+ config.position_id_per_seconds = 12.5
+
+ # Vision config
+ vision_config = Mock()
+ vision_config.spatial_merge_size = 2
+ config.vision_config = vision_config
+
+ return config
+
+
+@pytest.fixture
+def mock_processor():
+ """Create a mock HF processor."""
+ from transformers.models.whisper import WhisperFeatureExtractor
+
+ processor = Mock()
+ processor.audio_token = "<|audio_pad|>"
+ processor.image_token = "<|image_pad|>"
+ processor.video_token = "<|video_pad|>"
+
+ # Create a real WhisperFeatureExtractor instance for the feature_extractor attribute
+ feature_extractor = WhisperFeatureExtractor()
+ processor.feature_extractor = feature_extractor
+
+ return processor
+
+
+@pytest.fixture
+def mock_tokenizer():
+ """Create a mock tokenizer."""
+ tokenizer = Mock()
+ # Token IDs from https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct/blob/main/tokenizer_config.json
+ tokenizer.get_vocab = Mock(
+ return_value={
+ "<|audio_pad|>": 151675,
+ "<|video_pad|>": 151656,
+ "<|image_pad|>": 151655,
+ "<|audio_start|>": 151669,
+ "<|audio_end|>": 151670,
+ "<|vision_start|>": 151652,
+ "<|vision_end|>": 151653,
+ }
+ )
+ tokenizer.encode = Mock(
+ side_effect=lambda x: {
+ "<|vision_start|>": [151652],
+ "<|vision_end|>": [151653],
+ "<|audio_start|>": [151669],
+ "<|audio_end|>": [151670],
+ "<|audio_pad|>": [151675],
+ "<|image_pad|>": [151655],
+ "<|video_pad|>": [151656],
+ }.get(x, [0])
+ )
+ tokenizer.vision_bos_token = "<|vision_start|>"
+ tokenizer.vision_eos_token = "<|vision_end|>"
+ tokenizer.audio_bos_token = "<|audio_start|>"
+ tokenizer.audio_eos_token = "<|audio_end|>"
+ return tokenizer
+
+
+@pytest.fixture
+def mock_image_processor():
+ """Create a mock image processor."""
+ image_processor = Mock()
+ image_processor.merge_size = 2
+ return image_processor
+
+
+def test_qwen3_omni_get_updates_use_audio_in_video(
+ mock_qwen3_omni_config,
+ mock_processor,
+ mock_tokenizer,
+ mock_image_processor,
+):
+ """Test the get_updates_use_audio_in_video method directly."""
+
+ from vllm.model_executor.models.qwen3_omni_moe_thinker import (
+ Qwen3OmniMoeThinkerMultiModalProcessor,
+ Qwen3OmniMoeThinkerProcessingInfo,
+ )
+
+ # Create a mock context
+ mock_ctx = Mock(spec=InputProcessingContext)
+
+ # Create processing info
+ info = Qwen3OmniMoeThinkerProcessingInfo(mock_ctx)
+ info.get_hf_config = Mock(return_value=mock_qwen3_omni_config)
+ info.get_hf_processor = Mock(return_value=mock_processor)
+ info.get_tokenizer = Mock(return_value=mock_tokenizer)
+ info.get_image_processor = Mock(return_value=mock_image_processor)
+
+ # Create a mock dummy_inputs builder
+ mock_dummy_inputs = Mock()
+
+ # Create the processor
+ processor = Qwen3OmniMoeThinkerMultiModalProcessor(info, mock_dummy_inputs)
+
+ # Test parameters from reference video
+ # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4
+ audio_len = 85
+ video_grid_thw = [6, 36, 64]
+ video_second_per_grid_t = 2.0
+
+ # Call the method
+ updates = processor.get_updates_use_audio_in_video(
+ thinker_config=mock_qwen3_omni_config,
+ audio_len=audio_len,
+ video_grid_thw=video_grid_thw,
+ video_second_per_grid_t=video_second_per_grid_t,
+ )
+
+ # Updated input ids should align with HF implementation.
+ # 151669,
+ # <|video_pad|> * 576, <|audio_pad|> * 25,
+ # <|video_pad|> * 576, <|audio_pad|> * 25,
+ # <|video_pad|> * 576, <|audio_pad|> * 25,
+ # <|video_pad|> * 576, <|audio_pad|> * 10,
+ # <|video_pad|> * 1152,
+ # 151670
+ print_input_ids(updates)
+
+ # Verify structure
+ assert isinstance(updates, list)
+ assert len(updates) > 0
+
+ # Verify start and end tokens
+ audio_start_token_id = mock_qwen3_omni_config.audio_start_token_id
+ audio_end_token_id = mock_qwen3_omni_config.audio_end_token_id
+
+ assert updates[0] == audio_start_token_id
+ assert updates[-1] == audio_end_token_id
+
+ # Verify both audio and video tokens are present
+ audio_token_id = mock_qwen3_omni_config.audio_token_id
+ video_token_id = mock_qwen3_omni_config.video_token_id
+
+ audio_count = updates.count(audio_token_id)
+ video_count = updates.count(video_token_id)
+
+ assert audio_count == audio_len, (
+ f"Expected {audio_len} audio tokens, got {audio_count}"
+ )
+
+ # Calculate expected video token count
+ spatial_merge_size = mock_qwen3_omni_config.vision_config.spatial_merge_size
+ height = video_grid_thw[1] // spatial_merge_size
+ width = video_grid_thw[2] // spatial_merge_size
+ expected_video_count = video_grid_thw[0] * height * width
+
+ assert video_count == expected_video_count, (
+ f"Expected {expected_video_count} video tokens, got {video_count}"
+ )
+
+ # Total tokens should be: 1 (start) + audio_len + video_count + 1 (end)
+ expected_total = 1 + audio_len + expected_video_count + 1
+ assert len(updates) == expected_total, (
+ f"Expected {expected_total} total tokens, got {len(updates)}"
+ )
+
+
+if __name__ == "__main__":
+ pytest.main([__file__, "-v"])
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index 262ea771d9cdf..7506ee8656fda 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -23,7 +23,6 @@
"""Inference-only Qwen2.5-Omni model (thinker part)."""
from collections.abc import Callable, Iterable, Mapping, Sequence
-from copy import copy
from functools import partial
from typing import Annotated, Any, Literal
@@ -387,15 +386,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
self._validate_mm_kwargs(mm_kwargs, mm_item_counts)
self._validate_mm_updates(mm_prompt_updates, mm_item_counts)
- use_audio_in_video = False
- if "video" in mm_kwargs:
- video_items = [item for item in mm_kwargs["video"] if item is not None]
- # only check video items (if there are any)
- if video_items:
- use_audio_in_video = all(
- item["use_audio_in_video"].data for item in video_items
- )
-
if is_update_applied:
mm_placeholders = self._find_mm_placeholders(
prompt_ids,
@@ -404,7 +394,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
self._validate_mm_placeholders(
mm_placeholders,
mm_item_counts,
- use_audio_in_video=use_audio_in_video,
)
else:
prompt_ids, mm_placeholders = self._apply_prompt_updates(
@@ -414,7 +403,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
self._validate_mm_placeholders(
mm_placeholders,
mm_item_counts,
- use_audio_in_video=use_audio_in_video,
)
return prompt_ids, mm_placeholders
@@ -640,19 +628,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
return mm_processed_data
- def _validate_mm_placeholders(
- self,
- mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
- mm_item_counts: Mapping[str, int],
- use_audio_in_video: bool = False,
- ) -> None:
- if use_audio_in_video:
- mm_item_counts = copy(mm_item_counts)
- if "video" in mm_item_counts:
- assert "audio" in mm_item_counts
- mm_item_counts["audio"] -= mm_item_counts["video"]
- super()._validate_mm_placeholders(mm_placeholders, mm_item_counts)
-
class Qwen2_5OmniConditionalGenerationMixin:
def _parse_and_validate_audio_input(
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index 61f218f16d79c..f5f88f66eff91 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -68,11 +68,11 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems
from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataItems
from vllm.multimodal.processing import (
- BaseMultiModalProcessor,
MultiModalPromptUpdates,
PlaceholderFeaturesInfo,
PromptReplacement,
PromptUpdate,
+ PromptUpdateDetails,
)
from vllm.sequence import IntermediateTensors
@@ -87,7 +87,6 @@ from .qwen2_5_omni_thinker import (
Qwen2_5OmniConditionalGenerationMixin,
Qwen2_5OmniThinkerDummyInputsBuilder,
Qwen2_5OmniThinkerMultiModalProcessor,
- Qwen2_5OmniThinkerProcessingInfo,
)
from .qwen2_5_vl import (
Qwen2_5_VisionAttention,
@@ -807,24 +806,8 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
else:
use_audio_in_video = False
- if use_audio_in_video and "video" in mm_item_counts:
- assert "audio" in mm_item_counts
- mm_item_counts["audio"] -= mm_item_counts["video"]
-
- # Special case with `use_audio_in_video=True`
- if use_audio_in_video:
- if is_update_applied:
- prompt_ids = self._get_raw_input_ids(prompt_ids, use_audio_in_video)
- (
- prompt_ids,
- mm_placeholders,
- ) = self._apply_prompt_updates(
- prompt_ids,
- mm_prompt_updates,
- )
- self._validate_mm_placeholders(mm_placeholders, mm_item_counts)
# normal case with `use_audio_in_video=False`
- elif is_update_applied:
+ if is_update_applied:
mm_placeholders = self._find_mm_placeholders(
prompt_ids,
mm_prompt_updates,
@@ -834,10 +817,24 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
mm_item_counts,
)
else:
- prompt_ids, mm_placeholders = self._apply_prompt_updates(
- prompt_ids,
- mm_prompt_updates,
- )
+ if use_audio_in_video and "audio" in mm_prompt_updates:
+ filtered_updates = {
+ k: v for k, v in mm_prompt_updates.items() if k != "audio"
+ }
+ prompt_ids, mm_placeholders = self._apply_prompt_updates(
+ prompt_ids,
+ filtered_updates,
+ )
+ # Derive audio placeholders from video placeholders
+ mm_placeholders = self._derive_audio_from_video_placeholders(
+ mm_placeholders, mm_prompt_updates
+ )
+ else:
+ prompt_ids, mm_placeholders = self._apply_prompt_updates(
+ prompt_ids,
+ mm_prompt_updates,
+ )
+
self._validate_mm_placeholders(
mm_placeholders,
mm_item_counts,
@@ -962,7 +959,9 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
def get_replacement_qwen2_use_audio_in_video(item_idx: int):
nonlocal audio_in_video_item_idx
- audio_num_features = audio_output_lengths[audio_item_idx + item_idx]
+ audio_num_features = audio_output_lengths[
+ audio_in_video_item_idx + item_idx
+ ]
video_grid_thw = out_mm_data["video_grid_thw"][item_idx]
audio_in_video_item_idx += 1
@@ -971,14 +970,17 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
if second_per_grid_ts:
video_second_per_grid_t = second_per_grid_ts[item_idx]
else:
- video_second_per_grid_t = 1.0
+ video_second_per_grid_t = 2.0
- return self.get_updates_use_audio_in_video(
+ placeholder = self.get_updates_use_audio_in_video(
thinker_config=thinker_config,
audio_len=audio_num_features,
video_grid_thw=video_grid_thw,
video_second_per_grid_t=video_second_per_grid_t,
)
+ return PromptUpdateDetails.select_token_id(
+ placeholder, embed_token_id=video_token_id
+ )
video_replacement_fn = (
get_replacement_qwen2_use_audio_in_video
@@ -1004,14 +1006,50 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
),
]
- def _validate_mm_placeholders(
+ def _derive_audio_from_video_placeholders(
self,
- mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
- mm_item_counts: Mapping[str, int],
- ) -> None:
- BaseMultiModalProcessor[
- Qwen2_5OmniThinkerProcessingInfo
- ]._validate_mm_placeholders(self, mm_placeholders, mm_item_counts)
+ placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
+ mm_prompt_updates: MultiModalPromptUpdates,
+ ) -> Mapping[str, list[PlaceholderFeaturesInfo]]:
+ """
+ Helper to derive audio placeholders from video placeholders when
+ use_audio_in_video=True.
+ """
+ if "video" not in placeholders:
+ return placeholders
+
+ # Validate audio and video counts match
+ num_videos = len(placeholders["video"])
+ num_audios = len(mm_prompt_updates.get("audio", []))
+ if num_audios != num_videos:
+ raise ValueError(
+ f"use_audio_in_video requires equal number of audio and video items, "
+ f"got {num_audios=}, {num_videos=}"
+ )
+
+ tokenizer = self.info.get_tokenizer()
+ processor = self.info.get_hf_processor()
+ audio_token_id = tokenizer.get_vocab()[processor.audio_token]
+
+ result_placeholders = dict(placeholders)
+ audio_placeholders = []
+
+ # Each video is paired with one audio
+ for video_idx, video_placeholder in enumerate(placeholders["video"]):
+ # Create is_embed mask selecting only audio tokens
+ audio_is_embed = torch.tensor(video_placeholder.tokens) == audio_token_id
+
+ audio_placeholder = PlaceholderFeaturesInfo(
+ modality="audio",
+ item_idx=video_idx,
+ start_idx=video_placeholder.start_idx,
+ tokens=video_placeholder.tokens,
+ is_embed=audio_is_embed,
+ )
+ audio_placeholders.append(audio_placeholder)
+
+ result_placeholders["audio"] = audio_placeholders
+ return result_placeholders
def _get_raw_input_ids(
self,
@@ -1454,7 +1492,11 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
)
if not len(second_per_grid_ts) and len(video_grid_thw):
- second_per_grids = torch.ones(len(video_grid_thw), dtype=torch.float32)
+ second_per_grid_ts = 2.0
+ second_per_grids = (
+ torch.ones(len(video_grid_thw), dtype=torch.float32)
+ * second_per_grid_ts
+ )
else:
second_per_grids = torch.tensor(second_per_grid_ts, dtype=torch.float32)
From 97588c4d1231287eb380cf9fb95ec77f88479b85 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 24 Nov 2025 11:28:56 -0800
Subject: [PATCH 018/134] [Model Runner V2] Add minor clarification comments
for Eagle (#29332)
Signed-off-by: Woosuk Kwon
---
vllm/v1/worker/gpu/spec_decode/eagle.py | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/vllm/v1/worker/gpu/spec_decode/eagle.py b/vllm/v1/worker/gpu/spec_decode/eagle.py
index 0f11903e14540..59d0f313d96a2 100644
--- a/vllm/v1/worker/gpu/spec_decode/eagle.py
+++ b/vllm/v1/worker/gpu/spec_decode/eagle.py
@@ -65,6 +65,12 @@ class EagleSpeculator:
# [num_reqs]
next_prefill_tokens: torch.Tensor,
) -> torch.Tensor:
+        # NOTE(woosuk): To avoid a CPU-GPU sync, the CPU never learns the number
+        # of rejected tokens; instead, we keep the size of eagle's input_ids and
+        # hidden_states the same as the target model's. This means we pad each
+        # request's query length to include any rejected positions. By doing so,
+        # we can also reuse the attention metadata (e.g., query_start_loc,
+        # seq_lens) of the target model.
if aux_hidden_states:
assert self.method == "eagle3"
hidden_states = self.model.combine_hidden_states(
@@ -110,6 +116,11 @@ class EagleSpeculator:
# NOTE(woosuk): We must add 1 to the positions to match the Gumbel noise
# used for draft and target sampling.
pos = input_batch.positions[last_token_indices] + 1
+ # NOTE(woosuk): For draft sampling, we only consider the temperature
+ # and ignore the other sampling parameters such as top_k and top_p,
+ # for simplicity and performance.
+ # While this may slightly degrade the acceptance rate, it does not
+ # affect the output distribution after rejection sampling.
draft_tokens = gumbel_sample(
logits, temperature, seed, pos, apply_temperature=True
)
From 4d6afcaddccaf281385ddfa7c6078916af7d9d20 Mon Sep 17 00:00:00 2001
From: Benjamin Bartels
Date: Mon, 24 Nov 2025 19:40:54 +0000
Subject: [PATCH 019/134] [CI/Build] Moves to cuda-base runtime image while
retaining minimal JIT dependencies (#29270)
Signed-off-by: bbartels
Signed-off-by: Benjamin Bartels
---
docker/Dockerfile | 16 ++++++++++++++--
.../dockerfile-stages-dependency.png | Bin 134558 -> 149377 bytes
2 files changed, 14 insertions(+), 2 deletions(-)
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 1b937bbc1225e..e03b9989a190c 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -20,8 +20,8 @@ ARG PYTHON_VERSION=3.12
# glibc version is baked into the distro, and binaries built with one glibc
# version are not backwards compatible with OSes that use an earlier version.
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
-# TODO: Restore to base image after FlashInfer AOT wheel fixed
-ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
+# Using cuda base image with minimal dependencies necessary for JIT compilation (FlashInfer, DeepGEMM, EP kernels)
+ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04
# By parameterizing the Deadsnakes repository URL, we allow third-party to use
# their own mirror. When doing so, we don't benefit from the transparent
@@ -328,6 +328,18 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
+# Install CUDA development tools and build essentials for runtime JIT compilation
+# (FlashInfer, DeepGEMM, EP kernels all require compilation at runtime)
+RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
+ apt-get update -y && \
+ apt-get install -y --no-install-recommends \
+ cuda-nvcc-${CUDA_VERSION_DASH} \
+ cuda-cudart-${CUDA_VERSION_DASH} \
+ cuda-nvrtc-${CUDA_VERSION_DASH} \
+ cuda-cuobjdump-${CUDA_VERSION_DASH} \
+ libcublas-${CUDA_VERSION_DASH} && \
+ rm -rf /var/lib/apt/lists/*
+
ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
diff --git a/docs/assets/contributing/dockerfile-stages-dependency.png b/docs/assets/contributing/dockerfile-stages-dependency.png
index 57a33524a5169c8b56c7de309cdeb243ab3fa918..b327eb2151f50e4d682fe533fa57e12f0b6118c2 100644
GIT binary patch
literal 149377
zcmbTecU;fw|3Ch;j&sa|j8Ji;5(@2|nHMb$?LjJrx}7;?94
zxw1T@6lw+Uq3jsCY|ri$vUCZkh0n%sz#4ZjW^eY2M8X~Te#5V
z<&G7H4)5O^zDnxQdFoKGWJK}AR`U*)+Eq)cmCtJ#$Jb8?O-v3ujppq5R^PV6yP(Aa
zW0~`%>UTtiAz;o2_@nN@kk^0w@=@FhjtldD`6Oz=G5=papgLVwv;Xo*j&Sh*%Vi|>
zOXvN(jw^TT)JS8=L~nM`)urnVKL2y|aNfML@|>KUA3|mPo}VjY>#j?YE4aV=mit1=
zWZ^x*pT8T-te$M5#(eS9C28M%+sne_N^Udq>@)wE@mO!3$)4mov-GaC&X^k0l!kq0
z-g++EuxHDijeMNK>!`R(vlCUib&_vTnEf;LR%Q_7B_hm>o>qg<2mx
zH>xla3tLF>Z<)IY^7m{TodopW+|G}6nd;o7k!<3fmt1QUxoi{vu2C(!6P;BVLJD>OaYl$g)v$8c{obv9@2
zoa#D7#>5nF_tB(V77MZ8TF)J$W&R3f*1>ACZ?o3Oi
z7gM}jp6qm!{keG5rdu+9>k#P#e8f4Lr4
zkuMMXg5Q>f?>>M3*i*6uzWk;SF|q}FEt=nok~tZCf6$Ay#X<QGFxj5IY^P#uv9shV8tndWSFe%=zX5zXJ0Z@gzW(7K(LX4OvrV+~u#
zaUZTl>`XR3TJxiQ)%>h3w~>sa~W3@w&_yE{VGGXY-+Zz6dROJZm_9i&BuB4$g-{2m-n1PZIZDMs~vvI
zqHu2>gZ~mLT<(`C-#u34GF3G>R^lStKjS85T9>?Aa4psOVTDMSr$bLFyPsLwIeN3p
zbeoGs?pl@E3vmB>fzhCuu^>&X9F5*urnTm#?d3PLM``L$#V{5_BwGZ
zD=RMwn@_og&YcEptNIIh`pB#_daujOgiuHL6Sw6HCAX5GL12i{`u|zV-8?x^!Y#Op
zl7Iex!ee7Z!%)sG2m3MahCW(0G$48
zTJf{YQpG)&@Xe!
zCpu8bla%K)KDcAg9~g;5mQM9cMv9;7mqptc=rmkY+cdi|7P?-0EakzRl!u>)yKv%{
zncCwp+{H>mibjf4IKPqFcsaoKW9slIb`=*c|G%?@nu&xbB
zwM_sj>mDqmR2zRC>SU&4S(rx*P7A4O!BM|$KLUXK;r|zknNZ`y
z-F^zs?%|*PSAJW|WEddQ{k+Ub#~ia!3NZU=&D;nf*ZhA(fJ*C_$Z!Am
zL0Q?mh`(-^R=Q1*O7?gKsYHx>bR~~H{UGdUO;K^v_ujQwq<;PL`zh}<`Z#pf!c)JQ`0
zM_xZy9$spw?a=5n(lXIFS5rWR2A$NmSXNKT!f9&2m!JhVC0HbD^83q)k>N7W1sg2<(I!=5*8~6u%>zP)gdG?f1
z)l6@!T;0`y@y^(i8p9whhv|W!8ByE%3sMC35v;LW{rS<}kD2&*EGQKhS5Q;)WhEG@
zShzDvu4t5drk`6a{`6t`nZgHqQ@auhK4iPhJU43)2J(JKHqYksIrB)-l@)FeDRIoK
z(s6O>?i3sMciRX?)Hd4W5i7f6fb7*;97`V`4iZYFd9l?v
z^k!zCHKC~R5zbpHzK=3a9JRM4I_2XrdKBs$kh3h^5jT*nOE&G~b?9%--{Dh0qa&|}
z;2P(uVziQL4E)lrf0zFwj{y&i%`%57E1VednA|@fbLhihd)Ywt%tZFs*C!r?
zb0{e*>(?acr+&J;^aFC0a8Y+-_Do)-d1}jj#{s{_sN>CD*)!7!s1{CR0?fI9e1(g|
zP!#(C*h5f)&ec9Y_fWt*kV`AA*(ZBaKcE#zk~JT(t30ttfc7C)CmTRubg5X1VM$=p
z_F^D2QLOJ+hf22k#3-&uFAflCLx9rg^;Tsv1kYNu`WhOHe*63^YVfWNMe&mT`1OcHaG0vl}SSru;ELbV(StchjrPnDjmM_<#^u+|bmOVyA
zMJ2Ycuq1(B+;b9<+k)Uw*N--D%(4!g=+v1J&YtQvFN;u)Bq5DNEfaY%x}RmHkA>|2
zfVKr-`LB?ePYU(cCUvfNHoLxp>7hHA5Ng0wiF38N~ha&e?WTW&bYD?A#vOGqn)Am0Er>&`=O0R_7mMH63N3!
zu}-@^MEj3A4?bbh?=3H1n|1F)#$Fi1){E7IaMN8L(XmW)qaYpym
zzr3=z4ZJ)OxWTlJU*;h%&4){?W3@nlh;1=~{A3toP%H3|4!b40T%e{y#>KubQL6EO
z?K~M3K2V?CdwIY^J;iM5S-Q9L2~Z&OEG%PHU!zTo|jVXEfr#|U&$KsvLSjQ)4GHw#%rx6G$p
zAOBE9a*lj?UM5>WB0_B_Gk4KwGRu7D5`lJh)h-9cvttmR4IFhCm{jm
z0nHTf*Wn*2wBs3WM|NC30^p%LlZXN6UQ}7e?v(m|J!7z-=(0jq5`C6mA6>7sTakON
zF1ee%Cp<0GrbgdqvqM+BuYkGsN4vHF!Xe3@3Hb|4EfF1%&g%D$B?8A=Jh~sUpD;;V
z?!p}$B98MZNv7w*
zCkwLAG{3!TJk+}rv+xtJN{S`owuXo?wS+UY?poWdfdFhD59lPzSj0s&Op}y}(gOS$
zQT$YRl=aGQ%ijg>zfw6~kLn`E$H!;a8J*G(0NoOw#~hg%L{zM()Q_1Cu@=)Ek+CsV
z69Pu%99W<8*OqU(AM4a@Y&TdctimU0
zy1>VaPl_$2K-eK3aXmS5XLic~%{)b#w+>v|c^t$%fjmIL?iA@~N)cL(27aa;ytXax
zyrBWq+Si!=PEj3C#GS@NtUD`{+K|nxjEs!3z72rozK7O=e=;Wf3nE2rh-RV0sUPGt
zJXSA+GEfABko~R}A=ZD@k^+%F^Mi_qKhZC^-*R=m_FJN+U^x0YuJf!PreKhvp`mQp
za4}O!5LsfMK7W6HIT4VqEk8~tn~msLo6Ixr0)}PDTF#7DWrxZIiPPwamVICr$5YH2
z`sHQ@<=U%abpk9GP&wlt!cRQmbN;x}`P&suq$C<0*@hjlu1jQx5~rTaiC9y9rEoOdzA&4VYNCU-LRG+%u
z0yQskpo>I*qCwHSzyDa%Pm(!6r9FUA);fh(*PdX~+v`&;hmoRr0sLeWwK1aFCcw5<
zAnZcEDoro^xss0!F`z=2dw1W5r^h9RM`Z*Wl+yKDG2+a3dSQX~Mnqe5nC_jK?$!Ak
z2!^G7f0vqqNK;v%p8)qt(Qi3z#eQNA!<=pD-N|)f+7lkJDQRiK$W*E=eW=Hg!$W}<
z?{)+^n4`Y0|4{~Xmt!>x{e^e{RAgmjWV*+(JW9Hpx-us5x$6k3OoUMK^x*XsNu2<^
zyZ5ZcII~B~J->I2(
zk`kRm^Rb``YF6I^{cN}m!~-x>ks8QdiTH1rMv7T`O`>6GylX`{iv3_`RqQZ9M|SN`
zeL6J1_>0(@xr+@~4}LA(h6$4PU>3s+`|ui-hsWHV2cB_fv(P>jZ99E>GuF45+qv%+
zNim>Ug)c5N86AHJne1p=ae&0|Brvfk(o<1;SWqCuMG{=Igb)z3T-uy~K0
z*pO08T1Uj>^hmZG-3me~vg1J8bkqa@%61H*kkQcp?xTa(4UnW|ut$kVOaz0}Z~rVE
zfathX<}5(SeQupV?M!=ep~HEX)Pd3$Gg-;^fX&NghN9%y
zAya=qRg=qXMRCZdu?O7Co*s$~awxg7LYM=y+rpV%JM(1i&Xdh4*;A%mXr}sceM!VSuT-
z)Sq*p+~-4<^dTT%uuQJxq=xBGswIKGB0Vp*HRXAfls{AC!OHAXOJI#wPkJ|8XC4&m
zJQO(wEHZJ`?Y@fuc62@ThcWOE|7<*vU2o<{g5y~KbCa5VbtZ{2gyRxzyvAi_O2v5b
z1hPR3QKvzw?vkWLOUPZPamGVz_$`?+=c#e0&uVd8b~BJY=szDc3z5=_Wz^D-hRo~B
zV<@}-rt6(tq**#6k3sqLLx?nikZ?iVL%yCxmE-tK#j*W6Usn31-<=g*tvch_OhR8#
zYr<@MKenUp+0>p3gFUh75%=A5j*BbPe&`r3G|$H!@Alxkzj@`;0IjC>LEUUb{&`I0}RP`
z?yS+%WC}nD(=c^$r2w*rNw6bBtYcc_`Ws}#wQ5<8cJwpwVRl1vi4*@pXXMS7Z5E76
zew~U%3~`y6t^wsOL+!cjHLsinRl(8*7-ci)Dvb`%c?MtB|lwPlR-&e&}61&dcm1DCqFFL=KNIu1X9cmw`7*RUIaIbqhF
zD&L7x)t*yMi@U(wpU#3?NLQ}oW_XySJQo+1c@+POwTBmfPYSQT;r3yuyRtmceIIP|
z5LLLeCYT~lP8GH-KoXbJ#BdQwQu$u&>zS5P9Zw0CqLJ9nKrLGoHZOSV#m`sQ8+pu3
zi_fP_Hvgz7s@s4lOPce&EZ;C1h9JrawItDrD2w}z2y@zhustbH%zlt#Rti)8q2st-GXyuNfkKdQCZDcc83qLuA-(?I
z5?8(o$OiT$7g<(=5&*~
zQD6u|H7eN&-!D=n*0KG#4kwDAam0nDH?y<~(2SLAcZ->+k)5pza>7R6sbncQOwN+q
z|D@cG^p1`W;sw$Ox3%i7O@c_9Gp_WD>iYhhW2Z*rJ2pQPI;jZngZI|}MiY0Diu(w~
zg3AsH4GAk0`Vya7>f5vF!7vGZ)Ej9!p~!^*LJH^qqIAVJ{&YSnZ>;p2_swqIE>%yR
zsSXuJJ|m@n^)@9dR!Y^F=r(OZF5=KXeFZO^b{G6|gXpJncbG(yhcANpLb#>e{uu)xwMUX&@Jz}x-=6g}{K~8|K
zsqhuOSV_8_w|KKW8LE@wcNw($DE6iVNT4+#s#C*tv0nvYtCGr6+eB;zvXPJ6-f$N%
zBsxclinGC<+e7RxdNUx|+d^NSrLqepod#hBZbd}c(3|0A7NZht|Idwb08+GS;8>Qy
zBP10*6=`eW+pCqpQXh8+Z=^Q8IL?*ITBT_zbHshKuMB?;D=h&E`uyEh&
zH3O)x+i7GYEMpdZfo&>z_n7{S{eQJnp1UYeOoRYgFd++M^|efksgA2GQ(vW0zqsub
zPj-eZs}B`-=ayB}WDya$u-w~OoCZ(8bm4)2-vwU^W{Iv?I@0Gr3G87S9lD4pk72R&
z#BldgS@=_De}EfFF;b?9mkmN#jAW|Ff3f2bm-*GuvCdr
zx8#RWCm-YwzK}Zl<(7(CB(N}I!wX0WNtWseBLXp!fHXF}JHXQ8CQ33QGLi>f4YS!U
z0h?7HEOde#5_;L7bZVcQ;m0#JzQi05*(KkyyFkjh$_yqNjomc^UTO9H0Q
zS+V~v0>EE``1pccYBy0`FzkW1+dNol#BLxv6s+^$?4`~;j~)2oaq%0)W>4Fbx|o3^
z1(2Y-AfZHjqB+#hEMm7Qky%JaCj^jK(AU5&cmQe*O)n(^8vqB=v1z~U{Q-vnoVy9^
z^fe6h1@&%OEHSztlYIiUfd5Y-G_XlO@VviUgN?=}>;Hf_z##J6$a&U%cy8i35+9I0
ztE<2*3jz~Z>%R=H;E$?bNhg)gUoigb1}eu$75aZndQTcF4FTee8g&MQq5#uKx+l&H
zQ~V1FW+(T8?8ZhjD0=8kY;{D`F5EOS5=uArW+8&c2jrNImS)ra5GjBc%xM?EqrA=!
z2v%N-WwQj)3k3@l`sBKxKWD*82Btr#(aYF)WU8C7zC}mUfTALs}q<(#TtDkrWDC-4;PL#=tM1NJK
zOmwHS6zhnPRGWlu1F`Sz9kmDkq=pLU+mBoHq_%~UPKSN^$LJxCAO9sWGge}Wuzd=k
zwupTpJkwun^5CuNNCzZdXK984r#ERi@<;t3&qEqQ_>>hDz1)^GdaTSz8-Q-l&mQDx
zK}?(4BzRDJiTt<%Ju{@Yrfi-kD=AhLDmnqA4+g~~QVUWv2vzIDE}-^Bp$n*uo~6@I
zSN)4Hb(cdB{)=4jA-YPy?}ZV(CRp$A9C)Vwq$16#W<+Zfg8yh-MM40zb{CR~9vGHR
z3&MEG6{$?=@(sxwA>2n1nKN_|Eh5smRiS~?8!>D$}k
zS(FkIeAbm25M;8MT_PXO*7JAv;4J&l*+Y~%7&fyVJjZ^I`|JQPyAL~+Kp7$xA!V2N9!DP5Jh)he4E~QMi{Mck9
z#Srxxd;D-773@yX9g!P|Bn1Pv4A9rwxn#YL<6Z)q%(BK>Q7>!*8dth1L&@8X%ykC&
zl!a_#4%FE9-+pW(h&nQ4iN=RIeR`dG9fU^eEz5vL0Se=;5hsUa;|Q_X@bKBjF&K>y
zvv@HM89X2H9f@&41VALHhs08Xq;{?$&6>aFj2E#!0W)aiC(&_6q-atXx|8tp_{a2Q
zsx9O-QjjqH(t)BvWFUY;FS1;gQv4TDvW^nh6rC2gH|+fj7u_kmnulj`q_<9erxq!8k0kiEZAz)Tak
zPIrMkb{R;q22Mm!{mCbQlcU5)vWJnvmvusFtN0?!{OduO;p^xu4vbL0uw+E^^iP*xhT
zdjauV?`xO=0Rk5Az{@qttZxG@wFl8;1jwaam*4o_>*IQ-`1Q>-1e+)+jegX?6~!Bg
z_kxNOl{iMC;lW-#qE$pEI^dh~H-6dR-x!ey5-Lt=v_Krx?o32FZL7B^*iG6_s>Wfu
zQcDDCtKR)=Udu65g;0wJG*LZ`D`?t>+P$__%qIyME~P6cuC!!5s$U32xUR)9J}E
z!o%L&+2n&pw`Q_>#)-%{0g*9joWzK&s+GpAI@o6tzLR)LUJmHkkZ4E56VsXevi=<8
z0Ma5#LbaeeUXKTDNYZ}%yIU;mq8s%_8f;5|IQ^lj#17TqYUL7hGTBf$>9KsK7-j(c
zPMS2QC$eWmAkf+Z$n$RiEH5OJMv4;%A4~t6P3DeYtclHqOv__j86^r3ZEO__7ix{H
z;8qw#C&(V~sm6_Xe{_vXUICDWvF+wQkBTeC_{k0=@`|?eSc_w~u}%SWAb(P0NWK(i
zmb@}ZJ16G*6-x4G5#JJBhLaXKdfLBe&n?Tv~uj66HDe#LYy&e!@4&dZ)d?
zWCj0EZ*5jMB#pLvin95meNAO}V1QRD%92P~uw(qLGuv~}K0t^Tof<4V_c6l`;B2Ei
zUc&x5rLK@ldz1rJjC{$etiSTp>gzewq+KWUOPa=$+o7HgV&bj-x2uu
zUcJjeiqA1b8$bp2!1`>t!2xIe_(InS|!K-Z_AbNUQ+>S}}ZR
zZi7mXTwHDp!cG)tk}B&_tHHab`)^j6BGnV{vk$$S%b0|DAy$sp*G!EMO&mzS0A-06
zJWiH)x5yPy!^Rkj?v*`h2|uz+P#Xy2+f`DV?~*j-Jp?bw<)qb;Xc5f^c{+!XHc!ipy@q;)9Uoa8YA)H!BmiCLI0>6=T4R0rsvzF+$ZZ-8^WZ)
zrb!8b_#DZ%iR@30AN$k3SEF(KB&jm2G(Zc>my6an!ydS+MVj^aEg&VRqm}d?VM7Rp
zBDb|W#chaaPFgw0&49Xfi76o5FaT6AR%p?BSBlve?0gprV^o#Cd#GQl+s+s=gYlI;pcRr=1m;d}80s7mq>^EcDxDl}B%QsRyhs3G
zr1{vXmU&FJL>~;gh$d1~AJT04`o=0oVXLzbdCw)t_5+2HCxt3&*R9@|@hn|TmVF(}wp)i3!M@A1A6#M&)U12w}fxW)N}pY^B8?>
z>ZONMb-7Hyy^+o|3oVhv5yz2_MQkFCHK}comy;$ZC(F~Ujrq>L&-r`VynBZP%DgCGkFi*ZU3P)7dgc!u+#m>Y^}zIjg8z;O1DKmPBnLsnK+`??Ek
zY-~m-h_&gqvDN`TK0Y!h!h(W=IF-WXJ=^DVyaAwj>goCC)oa&`01(qG+hiYZU}tAf
zGHXzO`SN8)T}rG`MTGu=|L!9u{By<0oh0`@w;wVZ`dWIBJ_r`|3`!Qc5hVF1P-_nE
z*s+7r%s5`PJb3>CT(MTY9Fu-rNJM14(m3T7ckdnpljz0w>Mv3ujaxNkWMq`5S{g3;
z?Y9UlDyK?}<~&1keua|E@;kEXKS7B69bXIwA$KdFth3Ulr>E7^ZK8*=!oxX?_qRbF
zc#a+~zCw0hJ$?L3{MK8+t_4S?mf-)bY9C@-uRN#>HRm?53c<+r=faL6J*T(kr5S`-bs#Q7HQEK6@x101XBa5T
zQ92O&69jR~#VFthW29+%2$D&JMaw%o`Zm{pc8iN^*QJ=N96fr&d9qJR>&5x44KIYP
zJHi2CmMmL#FFQM1Q%h^9pyKRwFM0ZFLC(#vX-?5`2{KTujrDwy7
zL!IM$t;aKanrmumGGM18C3b+LR(eQGheObN7=X4c$(yAiv^A+|X`EZPZml}(Z>RR>
zk{rkU{Ctw!4jep~bUJs*6darOH_8!8w0ZO9Em^WeK~j>k{1;9px{~{Ux
zaZ8Dd+rxM6!=d8hVgsmpl>8Ub(N9-MSITB4rnO8=y9ESNHs2x7k1=rW-Fu?QPe4Ug
zRh8Lr$(HQg*RKhJgn)4Kz<~n{jEuf!ny~N~Hf#w0^hr0#wC?1J6)P6`@0h(`?i=2@
zS>vrO8YuRN5M^z3B&LwulJ4cT7LiHRE~%y$FJ8dPI3jG(d;{{1z;;&Fw-pt99*K)Q
zt7L=3qlY@Hc4CUrVXk=k^y!Biq@<*h&XpZO0NcZ4uZTN!aLqnk9JTu5;~R5YGRZ^l
z)zQ)E>;|cCdvQZCP2*4d=C`Z1Z{OaI`f86O83TVErtCPjZ$G?vC9@LZluYbwWDFJe!RK$>x>bfY9TI;?
zv|3q?LJcnv$UFAzkte&ZI{t}&K*0C7KgRrju2<0cS8>M1rIJ0G={7Qe(I?KH4L=#J
z{`A3v2fTYR&t&A$SN(wK_HF%l^Bm*o7;VF)J?cB9X`r4_tr{kN??!wPm
zUcG)@Bi$4|i;CAyh1|SgYpa6Pu$F-#1RCBm{0`fh*(6N$Jo?}16vA`{NVsUFX7y=}AU<5SMOp`Hd26}@Qf
zO!W^9m*w3AFV%}!cibZpj!`*c|v3|hMipb|OZy9EQmd?&dC>sSOC2}A!Cs981<5fT0_D;L8#?=YG0UpmrY9nmzG!*w=YU49S_Ew$Q
zXJxps?$w@UiC(jRnf*U`Ud>dC!%QMJ%r1!t65RY`ov_
z$U7fybH(x+%!Hkt-3}okHRJ(hO-=ue{3hYR4NqJH=gANHi#ePG?pDS|Btp#V*ROf^
zy1Tm@m4)6RWjfu~oX@Pj`oCN%Den6VL`X+Rl@v1Q3lQ9`w{joDRyf9;@$GWWIgDw`
zVmh79fP@3JSk;9jnWrHkXVSFLhS%KI7KSp<`PW|uu(jOW+!UavF5$2Y{1O{sE2CQxaL?Es?H%$SRz#5T;$Mz+)i|T&|
zf^7Zu{Z=$J>r9OFR^d^t0iSlI)FHEjU?@_I^RFsmZtx(FkUxl$x1L*DTA;YDP6N;f
zrBeyisrkzng&bexaeS?j|8B+MFI$X3%#{G;&uqAc`kLf4VIu}T)gX){U`LzbF17BD
znHo4`QVM^Z3@tY|x8`+WpqK{mGVhLqIijadog%%rfJ|?AUDm@q{S*ws$bs64^kr20
z{MjFB|K8W@#LxSBt~qhy1S+rmnKL0~A2ZI`2uo&7pidT1_Ai<}h&U3=8H*zv?{=-b
zg^6ovZ-0h;^$ZbYfJ3C)0V)Knn%*oweB=n{zI`9MtmGGNXJ_w3CyDTdFMt2@>Xr52
z*Arvo<3?y9HVFHzS!D6z#huP3fOFQ^Ap}3q)?GQ#zQ6y=0aziwd%H0qOL$t_C{N30
zZEX+@_KG}>nloFT-mhODGHpmx-XvgF4snxV-MUb$vM)+7!zO?SrCaNEH)H$2+YE<_
zU}0m!FS48B=>PKN%U}U_HmQlij-zq^{PU01P=|72*?ia2udqz3R;@Zlqn$ucHm}uJ
z`Jt|wL=JyZBvse>uO~NC{XQuvDQHiR0Wo>k*rh>Ud10kQtxZSzKC$gR5
z*h>nTt1GpfSyUAX`tFr0S88@(L*%d)mdwi$y?WkR_GkXhnF6{b3Pw95%MT#@Wq11e
z`xU?vl%v)8>@Hq>e`*0`@d>FA+k|}Idi2{VWae!RFMvvn(INn8F$AD~iQsXHLzqiD
zBYbAEO@cJy*p*{6Q|A{GC}VKSc?GrT$&I2CU@!cA_RBNm?c1Y85B45}+vjW6m}P3IL+CwL|HVr{OA(z_3Zz6x}nA=bmRMM}IqRY_8t$+F&
z87ogy11ieD?*%Z%xP!m#_F)Xh1@p_J%%5D4zfk1bx%*vMlt*6+AdSp
z%Fh&XUw2Wc?_Xa3kqOybJ>z10(P0|gkSFMIivWrucc5G|a-_e
zDMJg~;}KXnQE6JAR+f`d-23)|8c@x@
z4Omhf>e7Khia#^rvHO4jy)YBH<2ce_bZ19pbR>E&6cF`$@k_2^gg#+?I*=VItpQQ1
zpsjCRzkWR#%B1zguxTcY_HkhR=uTK>5u~62QT+S6fGC4FY~G37K=09(CRy>}UcGDn
z{{GU%F?^3w#iFq|W4_F7e?2Ghly7S`30l
zJ=NkVDRG6%sQx_c_O*Zh8H`^#mgaD}6wTLj&y!nWX15xf2nxS~;rS{mJ`GU{1Hq)H^RAe-TD)Z6H(i4g+)
zpiSIm#W&;tY)+!4lYv_FVwZP(czP>FJClwC8Ms^TVO(u**xjQBxgYP2m^b@y
z-sqqyK)Jxl=CF`-_7kR!Sy~`}dqGa$rh@o
z0wNFlYzxh(^zqeXlbXNe$YuIjPL3ogO?c$yLTFaay}jmQ`-vD{fkGj)AIQiT
zey0(DaD-x*JL?XR5jpX(kS|S5WDY56HHnWMOHTii9Pbl?xOsCFAjwfWnSp4v#3M-S
z`^W=1gAF|-2oY``jqW@(q=H5i%YDmc1H@tV>eU1)28h}zd}vs~A~7?W2>K>X0*i(0
zL4ZAV4;8rh+#AImU*xMHFkHuksC&D{#O5XwhU4({WBASLIcO2;NfDJXT10Lk7;cLM
zg6cS-qX5j?2?ZHP$Cym<5-On$&U6`I)%K~zn+ID8d<2AcVz}rkzf1TV>hrPL!*u}X
ze*X#?Z;-~01y1@0X)NOs`ImKJO5A*rr@>7$z}jkR)JVwTtW=@wQc7Nu-=vxpbbzA*
z2uXw;LKh?7N_Y-cdih3PoUH8WiGic`uBxi4If|kdk;8OLR;;*xYJ-KfHCeotwl+f>
zb=+wC3NraM{imG8@BA>#sbJOA3r*Lw?c29Eq`SAEacRe=g|4d!E_x_xm%vKk`<2Bj
zamef*W>2ye*eC?edKGD+maWpVe=fq0qB~?92U59c$BrIF^Wqf-78Y}hljMd=aYISx
z!np-J_Z>QHY_&4&mFeYSaxCaKtZcrGVt)}wVh~N-?L9YDa5~;SpAk2MDWngKc#MwL
zzzAsK5B&b5av0E5br+|ucGJdTNDV?udFYR}1Ihe_Rs(0F(v;G@TjzT
z)>tTOJ7_5mRHaB4qsKXTR1dlS=x*+|HmjegDEA+90R+PpR;QOoDDOmWms@u$mxP1C
zaaq~>v9Ynh)Z1wUQ-G|K;GHfZr4aaorhx8Th_}LCJo71r-ymv|0LCrZX|?fs*K3na
zjc3;~cRg84`~I`evU4775X|ZbRd?9;w1THa12+4BmrYuQAFS6NCfZ<(769!fBn_m)n$T=
zb}abDjo-<4psW~u%t!?9=p^u{Gosx@!`pa3gw8Pu_gWlTkoG
zz`CpY1l_9d%i#bqhuI`Xxh?+p4n@CA&>7VDxAy7C8f*shDGpA&y=LpIOzHRF4_Er_
zDOF0+reVfz|8)+6bzdTjd1?s-%tcUcq%^9UPmGU0B(oX*VrJmWbjd1;<7gKKlMsn+
z%*wi1Hrc#xJ38KkISAg66
zVLhyMKQq}PHZ4}a?8pR={T=N+KAnMqfoz5gD2^qZC!@}0I`vvEq?XiX&Pd>M65B$X
zD9IC_T3YlonHU+#U9jf}H2l98>~Y$m5ThxiwriaiPRTMX;^N|hbmsLszn3bs?49vI
z9Oh%Uxw!#_YnUgpAkaMp9t16*5_X3Df_Lu{5KSX`j&$+D+){KzZo8FB?1le+FAXS5`CoTk^>}_
zmM>rax`{Qy()=@7f6}JSzp%n}axc0ImdzxiX<>b0+0H%-;a|{%+O<6KFBR$hP%m8=6dkXTW4fc11tv!-^p>Z?0J1opukXXqYn1?taLSdcaFb)9VZakIdDF=n>UF@4wy%FyXyf{bo-G!iLsj{
z#sJEX_vNt=9`o;)ywV&6h_N*iJtCG@}95Lqr{5VD&0^O`YiLB`0aHP{vx>7yC|+jgG$e{_anB
zZk;gs9}5A1mW+wE`|XQp~xDzcm$
zARQb=rhxBvpFAdx3y0j4qtQq&a7%A*EK2CR+}y+9OD7Mbo#3A4c(sAoB*
zViQ>28^HVxLec3&`?=E9C2N)NhCYbasyK#z8=1jp^Bvzn@G@g`a*zOpR@U6n5rKsx
z;yl52!@Z3`>>id76Umzg$jy-MP|u_D7UE?p9=v^}rKRKe10Z}D_yp`*g$S+mWYXgX
z*t1(osH-7e1p<^jw8yAkt(7ZR64Ht}FC%&u6H`E4*hB6WEaFi(M|0fS*%^WU5H=((
zvz@?8NN)0H&z^O6R}6uRAclGH1Wak^=-|YaNOdFKFvE?syQj$l%ziPPKpEp>WdwpX
z2)n#ml0IYz_;&;BXzwUbXGW1|Z3Hck(KiWho7q@3gu>*e?yEq(i=8
z6zgs{x&h3(px;go$ASztIn-x4)osALR6=9#u5jCW_GGcS0%
zTwH<*8HUodL0em!KS&X}{2LvYDeuqxpzC|d#!Hj|X}pc?q(62sCMM<@YZEzPT@ZFj
z6g@nTca0cArwHgte%2^AGDA);HJB>&nj7Q>PVqVnF9J9}B`7F(W&^RW^Z<%0_xd2C
zyhu+s+@KSov|}49D{H}|mOK)_=U#IasutTZ>-w3uTT#w{06l04sjtL_IhqT
zcsnG76@E<<0P_(<{JuLQBO@us+TLs8UBSq5Ec>Hgp>Tw<6(J9=iR=2n-@nQ
zjg+EPOJh>?&`fn3bIGS+=Ylr|9eJkkbRSa+TD8a`#6yQ$rgy}HIpDM2W*`~J;CHl2
zKI080`-sr{Srr)T-T68Z+nx<*=G_PLC+O5|FE6jXhQ*sZFJ{k7r;|-6ZAn~)Zu{M+
z9JkSbwNYGL+(t|Vfu@I+e4CjVe)EqeiBrf%Wv~4OFZ>ge=YH$f@kG-@hyL!v3sY)_
zn9V@MLwl2=rn8p6+Jh`|N1CM*XKE5BlJ+XD5SI`S`F^MfbLlB_?Nm4(l!#L}%SFVJ
z9m2xua56~(g}|tH#}A;Mh^DXvyb`%KV4^xopIpzH8rJAcBiTU2rt3YVYSL~?EEE*T
zvVX2FYmlGRb%1rT0$o_;|%zEA{Gwe^ukv(ni3X_2OPw)-{WRxW^~eB
z_6++AonqO9sOp0W015YMP1|D?o^0UPzg!bTd
zY6DSIF*)Rf?hfc(_^M5{48zgZwL(HdgrpPk9k$%euJm*hv}Gelmtz#f&cPvH3_;87
z_H88v1v>CE>D8Xax5RJsK0&pb$J)FcYCnB{`TN=@oeHMzkeC>VG^sdJ&Yo6TMu|-^6O9$dYz7AAfXLN21aF?Ve?*ibsh;h;T(05A&*T0*RVm8Q~>~Z4gIrcPad|UX$7r3tlkcFM$fN*0Co*0hNuWNG`YWq}55?
zSi&eRuEdV#7iK2dV}&n-+RS9>LVHVZ+8(R^w3kUuD%e
z?53d?5D;(&BBH?tzwsn;swq_O$@$L%fZJQl*H%ms=2g>Gh2mM)W2%FYJ4j!}z;N%J
zrnB{zj7FSoGD&_0UR8>a*#_xQ#soX56k@7zl%JnpAKn1NNMufI!+@SqY^k_6B0@rp
z$h%3McGbqfbd`PRu`;XAu;Y1E4XZ*!#Ms`>PG7iZD%Qz9K)Ls(hlhauNRO%4K@$^`
zP(2s;7yPkfco5&ud)=|aDmWBFZ}*{duf`-1DL|TKo2Y30d0X3*Hxm;Rn@ELz6DVp&
z-j;HWud<|syFiZ%lOr>bmH$;Wm&IlBl%R-+h8{xFc;Ui@XGXJaPXFGXtH@#O&YoNm^N>|4Y|<3*m1{{tp5lp>b1Vm#?#&)O7`8pbMt0=
zJF|M^$=yvbLfYTpYa_(VH%8AC4o#6xKKc~pqVVX8+Z=wAy09rFT1h@+qwf@
zeaRkY?52%-bX_5Hsr(5vy=8}Og8US4EsD<9Cgs|xB2jYS*5rqSp5)Btg7pKY}G#l&4w@b%T&BbBB
zOV6-db(GWaTEA=7c2J{^fpZHCqnBdI(xr)9h81_Gv2G_24he-9u=470x|gi&CP({x
zZLICkpgV*bq2Z$gnHExo{+J-ymPcWTQU
z090NaR7-vFA|y2yDS%jS)(C0-
zKR8&pCmFktcxVZHS!gRI#}htJ0a`tjaLEDzV{)QPBx5Dq#uiMRDcH5}aDjUw6hz{250e3j`lZN85c_pgwT9k
zc0o&16H05|E*2NCvC29;={ODs-anUtP6C-iTq~OW0#YBvu@S<^5DvYm`gp+H#l7`Y
z8_>P6cGIRv0DMWHfr1hS%VzjO-X0!u*U+BbcCROEg|F6@ys9b{6wM?qyf<;n2<>Cr
zw4L@zOHrDy_IY-(;mwhpI$0TLsx{NXI}d_*ZfiY1d$;K78(_F|1LGlvFll|)g@q2|
zSVWCx&e`g?Q~c|19Fs%)6v&IN)-=3VZ#w<-h@L^4>eYKwBfgovewf1gn5vHHrr(hQ
zl@aAyO@^&dzX=%~M#Nzejfr0c)8<+qs6>1!hcV)|904QE@&xx#e)YM{32z2cB&M|cooC*;W9TjM~?v;OX7kLJ^(tPwAE1qgM-i8Wxdzn
zCDC$arz%g!eDqf&MRKQIB5arT*S1L+HqvR%V5gbCXqH(k$8eD8Kb-*Eh&T2B+Y4~$$dL}v
zZ<)j_6gaiO*Y_A-?Ric*ggQIRpXphCQIze%Yg~%*4G)j?HbG7ytsr=B&bds|PX3U`
z&>dANATm$3TU(_V?_@ZMx7Jjk=O;=_Xz{WQ!RVn7m19}C=m}mXVviGRIMnY*BQMvV
zMP{Ugn$P`wTu=idL+NI*&|kC&Y-^kkSUlV1xKRkUmrXsmXZPL^X$j2S)k
zF3Uvgh|UAno#+5bR1-h{$rb(fA6`EdziWwT9tW1=>3-jbHn~n9#*%N-K*(Rwa8lWV
zj@;S`89@Lv13SEtWg`w<+^-|BmP3?q2OKihxYdBo=-s}#WY02mROF?Z8d$DMJj2%}
zEhgoJB>?enU)HpH-7V1P=)BupI$8b`ti6^qqUKOqNCzx=nbRpp$aY;1&9iWxhoH)?
z;-+b#3kHj>qtqm;tyr?;orMV+0!Yxz3`orAsc;cNqR1lKd8(_HIPH~932=wV%hbEZ
z-D&TpC)n8Y`1g?$OeRcHFJ?y2)*B52R1p)M>@ZP|#|lN^Z3k;X>%s>VCUeWJ34?gQ
zUry@_CmUzE5jP2>WSrKB#37IxwDqFBMNHeG1YrFcL3T0mZk%&zT+Zg-4r7V@LNC+`P2~kNYYce8Q6j_onj4ewWDMX@@P(qUIsZ>%S
z*`rjlRF;VTuk+zM%#6N`HHX?;iI-Jx^li^cAJC?YC(diY|0_`W*42iyGoFK0wO=
zwo;6x1(_rzla57xZ?e({ixx>jP2T9Nw`ke2WoM6xpIYg#5#5GDfF1Jc8@;_kK2Kx=
zFeDb|Cd6N+BdL$vuJQG3fi3OCgv^js!vv9h6wFKL2L@ULAQKEzp?FIgY8=BC`$q`Oe4
z8Mm)&Z9r6fL}eW)gA4f-9?E
zYG56`+r@6_R`uL;)AHq8+rrixTF|UjtHT7t5>(zfY>elnrF#x<^9%UAZIE@MhYb&}
z=dJOSQeVG(KAK9qG@xbB!wkC4Q7Y-64}o+30Zv=aW_anb<#XxX6PI7XYxa2b?86DK
z?4%7iK^|-0%IVd;*m0y5XX!ZOZFE4Mnp;|_%W}Og36Fddgn+Pgq0{r3Zr>P(ax?mq
zOrfL2&jHa~9MisKOaD7X&FhE9@5{|ueRq?%>(q?=DL(UAxUZ8u3g}wg;S^Q`c5MyrnWTOlsozS%1X^S
zlb43yuy4nX419jS<)3+7hkr=Jc_K<+Qs4xk3EaPb-xh9d)%|Kc?d1~zk6w@gMGvxc
zVeJ-Y9o|N}DiRG7EDQsvwh<39y1V}4_j1Xs^VL^gkp;n}z46LH0!{PZ0AQjYPJq@&
zIqo;>CCQ}fgMotK4}aWZiu8@jw-pBd9xWAh6R3dO-Skawllt=pkCB>SwKz@5XT99
znYTXQV@CLp0=p&cyJ$#@nY=^DouU(f4}SXdyY!;Bf8Vd#>zw6q^33B~mhxJaU-~+s
z=j$)FOYzf~7qfD86GJEG)l0Pv%ig>>z9R4CxxM<+)oCQ}p5WtJoSppc40XZ;mzu`K
z8v6=I5lQ=_Z|h&;GVNCzBGQs18IR#IAZp2nQPD7sawAhWFS-KwXlS}2zo;mn#OVuX
z!Kz-9XB-_G59=r3)fqvpa?Z5etoFdE6J9{pUK$UE);r36)hXHZ^wAi=j0)=2F*y18
za;6^QFM4wG5;6!UVv$EW)aTB&eS2vTHtI0g49FrTs{HNSO!A<2*kHF9>(s97A*EGh
zhjBD*Ptp?*NPqs*S9{vG`~COri(j9(pJKT5=DFe?XYJD31~#+N)*@9NAEetaYgg6X
zfz`+m`Tjn0kAz$ya@j7ML46cEC+Ro2@I!fT;bFz{nZ2f;o7!@-0WtG7FGb|!#xu&Z
zZHCJ^i@@1-$UH-lK^^LfoL`}5E@fqD&zN4SH0`@2ol6Ef%tg)X(Cn7;hs;=xt90~XcO+Zul+RZXYbBeKDqo)4$`@j*LRbX
z4QT~uQ%7k~W5UusIH+C@ZRuB+UWyP7b=qz616gcGr=GY}@XU<7ZN-O0f>*JZG8t>2
zRh1a9(`EC94IA{@Nn=I-qNAhZO>pJNR;Q%x4!V>Bqbl$WW&rAtk}0RXDgD}nx12^U
zE1aNfTRR*fmP|lNxz_E~{rTEsr~nV0uPPfSi%Pm@;?izwHLCLI<~}s$-ETi9dzz0}
zSfVoOk$3%*{2#8aUKO|~s3=($hTBrjaF=WVzZY_L!c|pGPE8#rrLySjNWsG~&Cq#^
zzHDNam-Y~(r{UBP+_vHO^cr?8D7P?_g}j_qI%m$D@`|U^
zUnBfyN7)Qvn`T;FD9XmIbuVw)XSMW82uUi3ywsuxJgy~ypMCx--^
zj_MYDZ=f`)fDIyR2eEO5P{w1*rbR;QtHAKXOvB=v``1c!2?a~6^MoKo$eQ2G1!;`%
z#DGJMmu|`4WAMkwk+!WzZ}o7#cTqZxo^g`=n{FRS<`VcBoV|jR-0Uud%x{(9{h!)7aPq#YJpo7n1E156eP-El>8tz6F(;f;`LY{?
zQs9+?_Fy=Co>ez_2OWm3oJx9h-0e*sMLj8YIMuptYO_7Jd>}tdSltKYy6dX)7z{tAy+$97dxyM36ip@0
zaJ8HN{`<@i9|z4s?Rsc@yH!w-=!|X=sUw})sjmk%`~%P9P0%9(?DFVz=4uwsy`yaZ
z8my&(_}ALCM^i~*?!cj(Z!?_+A~k}9%o5xi2XwuvGK+l8GJR5Muay$htnVTJ
zqjHMvxqAq*1L`CHSEp#nowWS=GA5(!*@2n0-*%W_7!(5ZIfMSi?dM=@5oCvzpLS6<
z-J{D~_2mlz30eF`@CPZDyn|1txbOD0>3E_W~eml&b>@A{E+5dhy~wv^a-x+pcx24wyQeiMjL><
zIXgLJ;`emJ=)|bx)K#2F=AgJiX$h@=Y5jHuUkLYwpm2;8V`)lc{oTOa8B9C>B)zY7f3F
za-#T?4(YqEtE=)m%`Y?Z%CA=t{1f-#`XNrYdo5bEXffC2z%nKhWRs^Yne*Vm0|)=b
zh|->aB!4=tvF)!nfOVA
z5puA4$RHi^$MJRRKihWh{=kW=X4@A}*i#gDXNQ`GG$g>wp-}kFjf5k{yp}%#T`{nn
z(CkC?-T3k|Jxab*S7*%HG5(hpA8Lo*cf5LH)Ww9RE~9&?P0LQXm7po{O0SDFfZ!PyRZ3I_p>4_N*GY;e>TJA}Qg*{o&F_dw
z^z?+~k~zD?pE)y?5UOrb{d~{T`1tq*n1!d8KKj~D^O*Fq`J8S($(4h~xi0OsudDUq
zSw|8RS8ptZv+!a@^{1WXF=>}BjeG$0XKvJ~Dv~Pil#iR<2s*3KTl{bWC*gNdyf?
zIxa<~Bo*@dv;O9Tj-C8eK>yWpRm@iU!VnnT7WJ`=9#vFSB!q#3-UfTato+;;J!Drj
z&0{Z}vrnX2cL}KdX!S{8Kj&DW71NCl!-prP-XoVdd}ZA|vT0G?q~=tMduVZXPeXO5
zj*L%G1lxUDwZm@o==F$-3eIKT#5Lh*)s0$yudNE)L(|@JrQ_~vmh&mkcQ5WZ#4XY~
zf8wsb`>(tgVub5`VIbs8@CN_2nmvSC;C_+^JtJ>bXeK0=5%-hY6j==$)8qA{BPv3?
zrSUTOG|a$Ah(gvUzTib^6nSHLuhrcR+v3Sm+vm2!PeLX|3$v+H!#K))bFV#d{FBo^
z-2vv0$XqNIJ|d@RTDucKgGNj>u1BY>HNKv?weM?1cUDs2`_VU7n$=T@n3g2Um8gk>t4Xte_@=8M=K7MHwIc34dz(&ZT!>k8RkIr34KXSV&3B)8w
zA3ZC8#Mg
zgvco+u7{$g{Y;Tkzy9EX4ZU2A@&;+7pprbKm(JVkJ{lw~VfG4MuYrNVpwlEruJkx2
z=aHJ&Nc>}&MRi$~6UU6PaVv}BX;3$9m0G>y!6Grvf?xh>(=vs^I%n|_XS%@C$ymI<
zE;dg`K<6L#so?eN>D$gefqT^Z#K>y^jcQI#kw?r6;wm}B_Ef(fC&1G&7V(S|k5IZw
zqc_sAfL>7%4Iqn^=Yxaw@%MLHB%N-AmzuGqXcI
ztdQ>O=S}C8#L=8B5Sw2mR(lk3#>cxpxl|ja`1+AY)K4LCEsQv95zQuTT7FyOsP{))
z4E^qUJl)vccgK!N_IXdA&K?TG2DRa+IOk85mGddl(@VE(X(&9Lb?erJo*E8nC&qE0
zEW3@ntpbfk%e-`JKQuGAUfI0eMPE^LzTO9c97+I=dVg3pIvx?za}K}z9{FAaPl70E
zoT`?iAfcwc0k#q)r0u3u`az$?z_+KGT4?rrggz0=-@h!nzZYBGfFmh(wzfjMO1W{v
zH`!JiDIYzK{q>)h`~sIEQ;Kj=;V`7qRNC0W%4Z_jDjG#?>2(5_4Vd?t)GpzQ!Ceb2
zC|rfD_Qu7<(Jt6^p7jMO|ZdBT+iI>51w+Gt&g`$8gXEQuGFXsHn(hp9rz7@VqPr(ukX61HOCq96fhVPYCRN5-NnN
zK;j=-e;$(w%t66oY2k~KmRnKp(L;1I?|AYNLiE5QL^Yh1E6jkDbA6pb=o!usulnVz
zR*}up2cbClOq2wS63d(b1+*&Qp(zFII|9DpC)ahpWj9PE0Ax1=R&nvz`Zf?`b@HFj
zYm!Nx!H#$lc1ue|2#+38Siiz90FjJjc9&a=R;|W)9|#E<04Oo}vy5&OysyyB-aY~(
z{3spwV&E2aO-(am_Di>TVgJFHKR&9-lblgw#z+?Ex+v+|HMtdN%OdFZ+qqM0>bigFoTr;eV!Iuasl~yOduhe*mG}?~1C+)_j}9lVXcxZ}duo}XT_E3H7|
z)Q)F;1B|zF?C6S|eyDi-m%MrNrbo;p>>4iZ*thTX83DEK7O?7;=(hhYNhEi6GZL%e
zot?b=#8H>LJ2l6nvg~>2`Ssjx5@%P`@|8T8_Aut0Wy-0=!e-5yxqMk|{J8P5YK2J?
z@?^)q|NdL|Q6PnI2TI(_uJ4K<|80NvYl8+y4*K>@A)d{6dVenOzQRsw-aR
zX1U*p4LS^F965517>7M?n@@}Bw|_1_Ii9=lxqYJJlS>P}4vqhG|Nf%WL(nWoecDZQ
z&%b-lJ?8R*2MO7Quj5}2o3{_)_*OpuQ6?+KdEdSdFETHmAdEPF*^1Rm2SU3GHHvlY
z4E9YD&7=;sYuCP0HE>NoE}iLL*0FD^?CFWArGEiY-|5_GTIQg_*c%HFlw0rbhb3@I
zjt4kIdcP>0;l9uQynp7cYju>Vsc8q==E3EOA!X@TX-22M9H^m@>V(OmO>E+$f;fD}
z@3+Wn&;HKtSCFyq^_#bE2#Nk4x2GS3jDZMyI>&!{O79o-MQazNpBHTE-*6
z1bNgEuFEGt=UC7WQHLpoF+{adKs8ZsimgFICLvzOW-^R^2`r^KvOB2+#HnNqZqr}~
z=5M{#y*Oh85g5w2OkMugu@OR)5*!8QLZY4E_d6QL8AHFy7j8WDlaq8Ejp1Qdt?kNjEp*eqkzUa`n3z
z)uG5I4?oB-RNnND38+KpZ#P_wA`us04p)kR$Sa@
ze+;%niI9?{KLmU&)FlwQ!5&p-L~O{C6s{KSi!6a+@T>D%(0dAxLEgmfVXMwc-6FV(
zG$E!>-&?a*En)IDYOiY|I%bZr3R>=Fx}WLo9_4H>B}>a8wIWq%LhD<4_d*i}Q#;1$
zN=0{H_qX;voFeHjBg=7ea=OP}!UnHh;Zg(O2_N1YKz}F`bquLPvOrU(ha~oqX{!>;
zCBtG%@w>jr!6^cCfIHsWm;T_B4}buLD46Lq?$4XSYd9c=N_$EwOOXcBT`v;9rZ;a)
zI4T4Oen2yb0oNq
zW6ME}IiARuoDKAXt>SP0N633kOr=!mem#7ws_@)>fIO0(qxV;tMr!TFW$PP!`8ss1
zT)$@uni^MgC#MsX(O;oSYEX8wL`)}5+QF5boR`s&VFe3{b7Kxe%TqO!0a_TPG@Q`L
ztnr<}!#{*=>Z^PR@{g$BfB(I6QFVQ1kN7!S3WY`4=RS8lY9`wP%H!BhGqkM3CJ8Wr&dl%|8pAr%5PkN
z*z}+En**jWJ}GG1hnKm3;3e@mf}F2(W!2{oO%#ryqBe&!
z<*t1pQDQvAPl~q!joYg)boITK94&V_9r3I5C6r%}>nB!Ee)eI8ze3@tW?l7L@Zc9T
zy2|?%GQ79!Zp^%G%70ZT{MI6~3_cL$`m;1C>;#EcFDgyLTNIY_7^SN}{y4Hm!^z;y
zXFhiDoVDm|QMbZ5>4=*wOwG&=ifaHKTAsA$XEG>4;dyV3e5^{;y#IbI>cNAwcnohP
zR)1Lx(2r=jgX=k18HBBsFj7;EZdmp45BSL@vr^0L=EDV7P
zReP$d8zI?~|EUP;AQC3^r^^fe^U_NpDJ&g!DzCH-(>KhP>I9}$E~8T8zdWlyFAXDR
zLkHAh22JqbeC5U|7D1iwop=9{*bcY{*1TsM5!IeC75c6Pj=B}3bYs1ZyZJ;;Il?sO
zY}X)kV<{nQ8d!cORKtN?+V-3MH-AO+ZzU44vNI3=v)@Kzw0H!*G_ufy6{jf(8ctxr
zZ54$kd_N;^+&iblE`Idz@hC;{S!5?l!7Y@L(QxNl%Wmk;k&BU<|l4$$i
z{y|WD^yJC@VDN33b|acW>0eq}S_;>TZ6(IfZTK6L@nC+VpVftQ$rKLxOQ0{GJZ5>Hs-%g&!9)@_BCO&xd5th?o&u03uO)#=j
z`s|gJm%pR!r;H#rWP9O@-%0=D+QS0Y1)~g&qvA7QD(#Ll_becD1y$L_Q7~aD-h;*lL$O6Lt)ZmL>7}?zd!Zjzc&dv^{>>P_
zLH}70Z6!sm!+~%3DP#555W0gGS!s%{Uog|oBp*;SgvY|bfs)>9o7K=SVz$A4*0iCw
zl%#}Pd73j1!fl=m5#LW06342u81*({4TCbtCaRh~H$7uK3Hv0=i^^yu|JdU8=J5#0
z+hFpEDiK74M%fm8za3AI0m(vUweiNXs1d{9PPAi%MEwbtkH>kSnTd%UuRx{qK`&*&
zow+N$V+_L#Cqr4|uo1*a9)u#hgFN8Lp&1#4(LV;7gvlJqV1}m|C>M{kFd$}JThAwD
za~jB;?k{c=^J&-Y;o0R(r$l}SENs5&SAb==;m0dzPeAIavFg@5w@S~
z@`zpgVt)E{u51M9?qOyy&RYssQe!-e9x=VbGlEv%phm6x>Z4irJ}8$0U$q~zM0_W5
zj$(zY!;Knx^NzwOgRbZ0<$e1pyj|d3WVm;Y0Mis76}fyxSO@`7|ADzdd3oGV$DUPB
z4UKWIE6gcKRp&E_TWDLAe|1pUp8!*C)M=PCV1I&YI4ypZ>46Zu8cwLAc)pY}_gzs|
zlKYPBIrcds{S;lv+8T-oV>`P;bsBe!q%<|0%uc7AJxC(mhT#>8;8)_C;ySTKdPd<{
zo2B2p6nX2O_SZ4qkRBO@nyR66P5}3SHUGp+Uf=wBMur7?rx?!|78b>NDBITy*UUMf
zRTk;y&m43DK~^JxnAZAm1tZYi0_!hTM6-Cw-v{GsTHxQF!rxHdOsUQa#at
z^Q}in=LuU=9378yt0J;%)8W|PbQ^{Bn39s>t%_z=a%)&w19aPvMA^=pw}2+ARl@RE
z)o^&b(f}f_lqYm&5jn6#cvi8RAq;)Ijb3z>V8??nv6N)sNKOPK!f)Q2boqrPU#B}~
z^M%Forb-L{s;spB!3NV`$3Vq7$ZHJ4GiZQ`oMLB-rKdOmk!9KEMu(zev)R3rYJl_8
z7fGtb=grZZ|sI$ADx@_OSXYV(0>C;PYgK$#-x@;?&c5rH~R6
zF$NDViv$rBQV7MXnjzJjQng6uQ{H`44%wb7s8I<{H9J&6Jghmtr?5*K@lF(;8scyj
zyRE^0GQUlEiK`QZ4579v(6}38G%m$$DqO7LUZ-x|Ar|HQx~~J?B`}{2D7T31?X8MS
zV**N*vtW-JS>tr?P(YAw{+kl>ca}qtfPAt`I3`rKO=<
z1xSBRNjB|
zRL$Km;aCExt6s_rgP6!_9`u1ho0DOiOhG3Wq0(Cs+*De)Vzb5X$-m#KO`CA&-||-a
z_idvvJ4#|<&ttzw>gO_pEhoGX3YWIKShF3aEhhVnJg)FO-XJ1@ts260nye+Bd+tsP
zX2G70DE_zQpFN#&nAi|AE5ERypqL?$;utk!f6WO+cl_?*{Subcam9@|p==7lN{|8K
zOUhQNm0w69gz^p>Vfq2j@`XPPWKD)vaODP>C|#hK9{*vo-}b3h8EZaur~
z?E%$r-hh}qiT+l(@tZ2FFQa$bx3X-baCO)lhTbSGLV(-CNRW_O@3{P=udbRx-8rSqQy1g(4os424t|zg_)?~0EEM6?ggdz7j
zvHCo+fhYH0<4A~PL}?m{8$v^4yj46?uCTx(4%_Pt-#6}uQQnzu;zdgQJhJkt5$6hp
z*?94f6R<$No%r?ve%=CDFnW26T#<^Z8;+Dv3DtFV&!li-*zEN8KZH;`Wzp#cge;Kj
z9&fAYatPibPr8oUFP_EoSQNVlxg0{4WIa0(e3c1(QGAF!`M<=|5%p>Wzn9PedJNNA
z_*s4*Bd^$M6){7Dd@54z+!W#{)FEwaTmj6a>8e&r4IU5
zim(8Z>1TS|nDrPY?&8uZq8qw1_8pA^dJX+)VX&y%YzHk+j2530^J8`XQ>Qw&uzo^PVBp6XXd7`tz71i{*&fCRj(Na5S3%7uyjEjc|YvvOXHRhWbfYX2L(nPiq2y+Uz7IP$EHro!Y7=+s0E
zqv`D&m!z1=ETk)Y^>O3#AD=dOKuaUt0o6SM#p?(&Kpw%5_lr;+Z6i&J>(H=pA!Yyj
zuq5Su<>z;W&4h`23V(&9TpWJ+tC*v(T!8e(ZAoJfS%=VmBnQo6W~e%wL6&BRq7Of{
zu@d=lQ<74Tk5B3nm_({zVYB8Ip9uu+r{w`9NxTvlkn8x}nqe*=BRQ
z^Mt3mA5F0=w)0S2YMa2+`@&cqbGA4O;)3hP{Hk+Yf_MUC
zo=Ex06WP^#`SP(baCVrs{7o!gO-Neu$h?C1@PYb|cAM1$w9E&6`?`0)j$kh1YY
zMr-3TToG|w>=5Y^PxqF>g7pwiQY8fdxkh`~#_pH*NrwdTftiQL`SMB%2OA-{uBlgJ
z&PwSWVvd4)qIq#SAmvR$lx?gPQqwL(y;fq@b78TR`NGdX{#MR#j_$FP!t3WM6s>>q
zvoHQjx=_GV+yi_O0cHunlMDJ$2v`s@_80`>#jF1Al^@Vk4goW&=T4w*gbF!G^cD>6
z=vhMHE7cY0wVx`)k_F(b&1Z0=WNXpP07Z=tDo^4m;l}XC+>S9MflzR>4eG5a4uv=Z
zF_1QUbY7{Ev9V^X4G5xiMy(eJ4W^!6QD(L(7wSCDrKLa@jWj9~{s8C$~
zfo1AD`0m|1efGLEGH>6hYN7pUe+$nEbUt_C6q+D-+q&nRkoFhkhABN}NN1MpjZhLk
zGO0%2DXcRpsZ!0FGf=7RUG}t|vfg`aMHV|BwL`kOjNq(U}k$hOK7E+Uo1*2&KyF-l$^{pZF{0P1Dj>#O!NOMfi#sOGp$0&pfp`g42i_EX`tTt&WOds~>b=&)pOcV_{}%I9+tWO%=rd
z*s?90RXi@$nB13|_g+$vhuZwU3><>0^<_jUtxVD0VO(XFaz?Smh2w!=aGR-y6s^G!
zjC|d*_T4JjMiqRPBV=4;dL@@JjRem}Rnl9x_=J?0S_5pTi2=!MPv?myj&sTHUT^5e
zIBrm(nwyz5A9zu!;87ImLDQ*j)H@YoACS$}U~zw09j73Zu0OpkS2qF-qKZGJom$xQ
zgL+S%qJ@|~>jci#2^2?D0`ipk>A04#al^6;1+)ZF84m&ScMVW77RCsVua)yKdOLgN
zJdJh{8m8}~S(l1}qq2%6CH6BQ@`FB5&b`1ZMz+7IA?spEvcXM#2Ood`Zh>jt;vYnj
zyhR*8{w7)njR3f@FFO}ajrh*h^5&khJ3{XoOzt!;2j*4?mk*?3?)=RqsA
z^U-v&n;HaxnTc%;CsyZLf&?iP9zH=qFIVcpO4Vwh+pK9*X}F};*HsJWC=#-V%MeY?
zNawXRT}=RySdlcfyY(D6(1Nugdq&Cwj=~{%rI^Ej&I#$eIr36Sb*OJQDht^V`mHV`g2KK5*SBuJvE6e!WfNGdWLO
z)fkgCY4qsP;Th93TS6NwhV~$gb1iLc`hHX9#;rNq8<^EC-bN>IJQ|B^aA)wfe&0DR2L
zEC2<8bYW4@k7ivD$n(rfi{xaD>+$v-#t?S^4Ufu!2{~mT!xQ;>+?Y6W5Z2OltP}zQ
z0%!=HiA)n<4wn#`&Sjgv7?V*}f4Y+>*e}rzKL!W$HtjCFuA1Wcd$VRC!d}SE*iEG@
zT|`bO8BxtVgaMs8HC1eeY=3hf#~8f
zO0=?@A!=&MOAOs<@ttJQx_WZM;DxjmZjs|kw~nbKd)RQ+CJ6V#
z;Nli7TgnX+%QK-mV<>cDa^3(KL1KT*!!X*@%I_b#V3cnIpd6($^XB=04`q4b%3UBA
zsTCR;I>_QK2Z+V@=zaGg#~ybk-Eq!eN*NF?VEuwi+23xfSg0WB(^kAIcFjdaSrr5N
zw=$>Z1Ve}`PQzY-zdv91i%A#8PH?&W5zlGNoaXYFPv@a3d9hHcHTa97@fH8A@AVwT
zrv?hq3l{tu$5rsgj{rR6F4TdEqpqzTlkG<1M?P_?rep`)6(=hQWF(w=s5B!
zKW8~T@#)VralP`};+wI`Jwu=3*@=QPtrumqfs
zUW3&W%BbuYP^ESR22NWi>7{qUi(~#QJQY`LuGEbh
z2CG+{|8+JbrNM-a@KQGE0lwSv=oE1vBV}IdL}F*MbMCx(oIn$ZgK+(kqT7a-VhoM|
zQSNNno0*7#%>E1lX|zPx-8)8R&x$81WwUwMVuj+X=f5rm`VVL-hmqb5vA7GhHe1pF
zy=z%nrj*|%*yqGJid#r;Bk>W9l%?fQ&fCQ>7*|6JV>-SC6*50
zfL#OA4$(+eA3nS~S_kfF{l<-V0_qc)=uQ`wv=wpX+?Mmu4}C$V&}86A&Xu30^u(He
zE1@23(s6+^rdM#7*@Tu(3w4TM8gPISO%A)o{1g+)bjlqIoQ+ts=XT
zkm<7b|54ZQ7T7}t6vJYMoZp0uqH%vGTYK@&K#}q-hZL$YKtqww0Xte!5IJ=)KZ=Zu
zOucs|E)MhS8*xwl2oe&_!+No#MK?zZg$C1a%Dtz6-T;@3tg@m+narG+P_NgHx=M}+
zr`i>d)vH&Fybl0*MyOY9MOgWdfS&Cj5+fmpayfFv#v&iUY(D#0g1IBGqI8{Ry&TUe%caU_gO=nQu
zr=+L*$NZhg1jTP$09C>rL2;6we>44eJA8aXI61k$RZK|}AEI#%yqqC2!B9fTu$5;(
zDhi*<+wB&tFz4uR=F+*9yMB1l3SvG&r9tiO{zo0+*xv(4h%e>IVYknQ#dK
zNzn0nu%+x`Ecqj;H0B|jo7Pf@;!Jc{m9O7%ga|kETPw{&ZUe@nrs3J={?nMfe0ISr
ztkfUH4SqxpgweQl&o(v87PqiblLQZVsE#U3nw&J2I%d!65<9G7ji@uvMSATxvg+8k
zelKB5SWEy~NnX58^EE%V(>wqF*ILTPw>8`b=_s+Vq(&zutZT8IMQ1$|)-UDmQQz|%
zpyq%P7cS_}wY7yzVs4bEu-}U8?@taB3foWcAaNyJYT1qzOL5Fyh=7txDyD`Z;;2b1
zG!8S4G+&G)0EJWUeZ)NyDCOG$C2(R5!%!GF3|lh5j_D>PW20o2XtN&+%&Zd)2GMr1
zx;PfYc+ph!J3l>twOFzhbdP`%WEG?X%H#JLlmT%hli>x_7i~N6NP{Q*38stplXdti
z$~_Z-46=vS$kzLjLr8T5{mcJeZ&lSRanZM-jtPNUxn|TUkwtS$#KSM}4tbT4!yU{u
zxUghy8CON|A=vEW-@AMFF=U&gli0h9esmMJxcn%ZoLV}Fi}Q^+V%jNzV4+UxLLE_+
z5XhDg!{y6#s>?02t^=WYV{Sn@0HG&@r~@gST}vyUg*Es!jf
zwN436?(|}u;(dL)Z8c#n{??~o
zOooszCELXa3J72V(0nG+3vf=QYga~x;B7TkrZg&}?YWfludgsrBMnn0{(x~F7SL0&
z=!gzKG8FT!By&JG=`D2f^Yi_Dd`y+uG3ZC{NheW1=6}m3Df%Rn6~4UI;=N0Ht(p*^
z6?*GaEt7#kFwH6FBeQEmY-7%mG-BsHv7-hEFP%ESox=0T4~ClK7aB>z#wx5ING+3z
zzyQnuSQ60d|Ex0PNhq>si#vo=KrC{j(t$cUQwXHWCDw~$Pddc>qLcIzImIKh-N-h?
z*g%pW(lPNCQND=>aLm(#K<=G38!(dPd(t2Nk-aBi7bQ&dSQV%GL^-^wS2h=Aa?f
z2Fya)X2ypFB+*kmK;$BSYu0LpGK)2-EeK~C0zGtIuAxy^F#8Xt@G~lDtoV1EBFnrz
zxjKz(B1iD7SS$YI0Q*TjZL&i2#k39mDea&wSQIby_;xBTJ>Th{_3PViZ%!YHNp#95
zLNS~xAT;gR??5{J6
zX7A6am#lc~EU6|&t3(uuj?@`%qjg9$S+XusRlpC8#(vU(oX(*yTHmvbFmISzLvbNC
z?1fizgwjW)b1fHX^;5(&ggfxxsW}lN;!AAl5gec3a!=yHib6W06-;zS`9RxOe
z;Cyi4j{`oqz`jSmU_M(*(ZgoK1R)`HkAKjcgSDcfVsf6)km6MCBC&l&6&}4sNIvAe
zIZP>KYRM?+ey3imqX<4gGdz|&RSq!x8!dub{ZhBC3^rmzpDjI=hkwL#+0fZW|Hc%3
z8~=cSa4JByDU~^RbgtI6dcXd9lgZ`JH1BO;A{Hyqnka9wlb?3V{2kELRTu*c7e-1i
zgoSAKVk#Q1$re{@c}7h6wj;O#P91pr{`aB?KXPo5*YyXHQQ|sLR0bk3aa%NzHz~%k
zq)-t{tQCrs5_qV>k&to>#VGW9fF_^`Y!31qHvMO(M