From 3137991f55c9372d4743154a56933a37e47feca7 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Fri, 21 Nov 2025 17:28:17 -0500 Subject: [PATCH 001/129] [BugFix] EPLB + B200 + DeepGEMM : Handle column-major scales tensor (#29162) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- tests/distributed/eplb_utils.py | 49 +++ tests/distributed/test_eplb_execute.py | 40 +-- .../distributed/test_eplb_fused_moe_layer.py | 285 ++++++++++++++++++ vllm/model_executor/layers/fused_moe/layer.py | 41 +++ 4 files changed, 376 insertions(+), 39 deletions(-) create mode 100644 tests/distributed/eplb_utils.py create mode 100644 tests/distributed/test_eplb_fused_moe_layer.py diff --git a/tests/distributed/eplb_utils.py b/tests/distributed/eplb_utils.py new file mode 100644 index 0000000000000..27a63e0215148 --- /dev/null +++ b/tests/distributed/eplb_utils.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +import random + +import torch +import torch.multiprocessing as mp + +from vllm.distributed.parallel_state import ( + init_distributed_environment, +) +from vllm.utils.system_utils import update_environment_variables + +mp.set_start_method("spawn", force=True) + + +def distributed_run(fn, world_size, *args): + number_of_processes = world_size + processes: list[mp.Process] = [] + for i in range(number_of_processes): + env: dict[str, str] = {} + env["RANK"] = str(i) + env["LOCAL_RANK"] = str(i) + env["WORLD_SIZE"] = str(number_of_processes) + env["LOCAL_WORLD_SIZE"] = str(number_of_processes) + env["MASTER_ADDR"] = "localhost" + env["MASTER_PORT"] = "12345" + p = mp.Process(target=fn, args=(env, world_size, *args)) + processes.append(p) + p.start() + + for p in processes: + p.join() + + for p in processes: + assert p.exitcode == 0 + + +def set_env_vars_and_device(env: dict[str, str]) -> None: + update_environment_variables(env) + local_rank = os.environ["LOCAL_RANK"] + device = torch.device(f"cuda:{local_rank}") + torch.cuda.set_device(device) + init_distributed_environment() + + # Ensure each worker process has the same random seed + random.seed(42) + torch.manual_seed(42) diff --git a/tests/distributed/test_eplb_execute.py b/tests/distributed/test_eplb_execute.py index 0a97749ac318c..9498e75b279b7 100644 --- a/tests/distributed/test_eplb_execute.py +++ b/tests/distributed/test_eplb_execute.py @@ -1,57 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os import random import pytest import torch import torch.distributed -import torch.multiprocessing as mp from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace from vllm.distributed.parallel_state import ( ensure_model_parallel_initialized, get_tp_group, - init_distributed_environment, ) -from vllm.utils.system_utils import update_environment_variables -mp.set_start_method("spawn", force=True) - - -def distributed_run(fn, world_size, *args): - number_of_processes = world_size - processes: list[mp.Process] = [] - for i in range(number_of_processes): - env: dict[str, str] = {} - env["RANK"] = str(i) - env["LOCAL_RANK"] = str(i) - env["WORLD_SIZE"] = str(number_of_processes) - env["LOCAL_WORLD_SIZE"] = str(number_of_processes) - env["MASTER_ADDR"] = "localhost" - env["MASTER_PORT"] = "12345" - p = mp.Process(target=fn, args=(env, world_size, *args)) - processes.append(p) - p.start() - - for p in processes: - p.join() - - for p in processes: - assert p.exitcode == 0 - - -def set_env_vars_and_device(env: dict[str, str]) -> None: - update_environment_variables(env) - local_rank = os.environ["LOCAL_RANK"] - device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) - init_distributed_environment() - - # Ensure each worker process has the same random seed - random.seed(42) - torch.manual_seed(42) +from .eplb_utils import distributed_run, set_env_vars_and_device def create_expert_indices_with_redundancy( diff --git a/tests/distributed/test_eplb_fused_moe_layer.py b/tests/distributed/test_eplb_fused_moe_layer.py new file mode 100644 index 0000000000000..55f26519887a1 --- /dev/null +++ b/tests/distributed/test_eplb_fused_moe_layer.py @@ -0,0 +1,285 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Test that the interaction between EPLB and FusedMoE Layer is okay + +from dataclasses import dataclass + +import pytest +import torch + +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace +from vllm.distributed.parallel_state import ( + ensure_model_parallel_initialized, + get_tp_group, +) +from vllm.model_executor.layers.fused_moe.layer import FusedMoE + +from .eplb_utils import distributed_run, set_env_vars_and_device + + +@dataclass +class TestConfig: + num_layers: int + num_experts: int + num_local_experts: int + num_topk: int + hidden_size: int + intermediate_size: int + weight_dtype: torch.dtype + weight_scale_dtype: torch.dtype | None + column_major_scales: bool + + +def make_expert_weights( + layer_idx: int, + global_expert_idx: int, + global_num_experts: int, + tensor_shape: tuple[int, ...], + tensor_dtype: torch.dtype, + tensor_device: torch.device, + is_column_major: bool, +) -> torch.Tensor: + assert len(tensor_shape) == 2 + + if is_column_major: + tensor_shape = (tensor_shape[1], tensor_shape[0]) + + x = torch.empty(tensor_shape, dtype=tensor_dtype, device=tensor_device) + value_offset = (layer_idx * global_num_experts + global_expert_idx) * x.numel() + x.view(-1).copy_( + torch.arange( + value_offset, + value_offset + x.numel(), + dtype=tensor_dtype, + device=tensor_device, + ) + ) + + if is_column_major: + x = torch.transpose(x, 1, 0) + assert not x.is_contiguous() + return x + + +def make_fused_moe_layer( + rank: int, + layer_idx: int, + test_config: TestConfig, +) -> FusedMoE: + fml = FusedMoE( + num_experts=test_config.num_experts, + top_k=test_config.num_topk, + hidden_size=test_config.hidden_size, + intermediate_size=test_config.intermediate_size, + prefix=f"dummy_layer_{layer_idx}", + activation="silu", + is_act_and_mul=True, + params_dtype=test_config.weight_dtype, + ) + + device = torch.device(f"cuda:{rank}") + + from functools import partial + + _make_expert_weights = partial( + make_expert_weights, + layer_idx=layer_idx, + global_num_experts=test_config.num_experts, + tensor_device=device, + ) + + assert isinstance(fml.w13_weight.data, torch.Tensor) + assert isinstance(fml.w2_weight.data, torch.Tensor) + fml.w13_weight.data = fml.w13_weight.data.to(device=device) + fml.w2_weight.data = fml.w2_weight.data.to(device=device) + w13_weight = fml.w13_weight.data + w2_weight = fml.w2_weight.data + assert w13_weight.size(0) == test_config.num_local_experts + for i in range(test_config.num_local_experts): + g_i = rank * test_config.num_local_experts + i + w13_weight_e = w13_weight[i] + w2_weight_e = w2_weight[i] + w13_weight_e.copy_( + _make_expert_weights( + global_expert_idx=g_i, + tensor_shape=w13_weight_e.shape, + tensor_dtype=w13_weight_e.dtype, + is_column_major=False, + ) + ) + w2_weight_e.copy_( + _make_expert_weights( + global_expert_idx=g_i, + tensor_shape=w2_weight_e.shape, + tensor_dtype=w2_weight_e.dtype, + is_column_major=False, + ) + ) + + block_size = 16 + + def block_quant_scales_shape( + shape: tuple[int, ...], is_column_major: bool + ) -> tuple[int, ...]: + assert len(shape) == 3 + if not is_column_major: + return (shape[0], shape[1] // block_size, shape[2] // block_size) + else: + return (shape[0], shape[2] // block_size, shape[1] // block_size) + + is_column_major = test_config.column_major_scales + w13_weight_scale_inv = torch.empty( + block_quant_scales_shape(w13_weight.shape, is_column_major), + dtype=test_config.weight_dtype, + device=device, + ) + w2_weight_scale_inv = torch.empty( + block_quant_scales_shape(w2_weight.shape, is_column_major), + dtype=test_config.weight_dtype, + device=device, + ) + + for i in range(test_config.num_local_experts): + g_i = rank * test_config.num_local_experts + i + w13_s_e = w13_weight_scale_inv[i] + w2_s_e = w2_weight_scale_inv[i] + w13_s_e.copy_( + _make_expert_weights( + global_expert_idx=g_i, + tensor_shape=w13_s_e.shape, + tensor_dtype=w13_s_e.dtype, + # Fill data in row-major and then + # transpose if test_config requires col-major. + is_column_major=False, + ) + ) + w2_s_e.copy_( + _make_expert_weights( + global_expert_idx=g_i, + tensor_shape=w2_s_e.shape, + tensor_dtype=w2_s_e.dtype, + is_column_major=False, + ) + ) + if is_column_major: + w13_weight_scale_inv = torch.transpose(w13_weight_scale_inv, 1, 2) + w2_weight_scale_inv = torch.transpose(w2_weight_scale_inv, 1, 2) + assert not w13_weight_scale_inv.is_contiguous() + assert not w2_weight_scale_inv.is_contiguous() + + # Add scales to the parameter list + fml.w13_weight_scale_inv = torch.nn.Parameter( + w13_weight_scale_inv, requires_grad=False + ) + fml.w2_weight_scale_inv = torch.nn.Parameter( + w2_weight_scale_inv, requires_grad=False + ) + + return fml + + +def _test_eplb_fml(env, world_size: int, test_config: TestConfig): + # Initialize model parallel (using tensor parallel as an entrypoint + # to expert parallel) + set_env_vars_and_device(env) + + vllm_config = VllmConfig() + vllm_config.parallel_config.tensor_parallel_size = world_size + vllm_config.parallel_config.enable_expert_parallel = True + + with set_current_vllm_config(vllm_config): + ensure_model_parallel_initialized( + tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 + ) + + ep_group = get_tp_group().cpu_group + ep_rank = torch.distributed.get_rank() + + fml_layers = [ + make_fused_moe_layer(ep_rank, layer_idx, test_config) + for layer_idx in range(test_config.num_layers) + ] + rank_expert_weights = [fml.get_expert_weights() for fml in fml_layers] + + indices = torch.zeros( + test_config.num_layers, test_config.num_experts, dtype=torch.long + ) + for lidx in range(test_config.num_layers): + indices[lidx] = torch.Tensor(range(test_config.num_experts)) + + shuffled_indices = torch.zeros_like(indices) + for lidx in range(test_config.num_layers): + shuffled_indices[lidx] = torch.randperm(test_config.num_experts) + + rearrange_expert_weights_inplace( + indices, + shuffled_indices, + rank_expert_weights, + ep_group, + is_profile=False, + ) + + num_local_experts = test_config.num_local_experts + num_global_experts = test_config.num_experts + for lidx, fml in enumerate(fml_layers): + for name, w in fml.named_parameters(): + for e in range(num_local_experts): + g_e = shuffled_indices[lidx][ep_rank * num_local_experts + e] + ref = make_expert_weights( + layer_idx=lidx, + global_expert_idx=int(g_e.item()), + global_num_experts=num_global_experts, + tensor_shape=w[e].shape, + tensor_dtype=w[e].dtype, + tensor_device=w[e].device, + is_column_major=not w[e].is_contiguous(), + ) + assert w[e].shape == ref.shape and w[e].stride() == ref.stride(), ( + f"w[{e}] {w[e].size()} {w[e].stride()} vs " + f"ref {ref.size()} {ref.stride()}" + ) + torch.testing.assert_close(w[e], ref) + + +@pytest.mark.parametrize("world_size", [2]) +@pytest.mark.parametrize("num_layers", [4]) +@pytest.mark.parametrize("num_experts", [16]) +@pytest.mark.parametrize("hidden_size", [256]) +@pytest.mark.parametrize("intermediate_size", [256]) +@pytest.mark.parametrize("column_major_scales", [True, False]) +def test_eplb_fml( + world_size: int, + num_layers: int, + num_experts: int, + hidden_size: int, + intermediate_size: int, + column_major_scales: bool, +): + if torch.cuda.device_count() < world_size: + pytest.skip(f"Need at least {world_size} GPUs to run the test") + + num_local_experts = num_experts // world_size + num_topk = 4 + # The dtypes are fine as we are essentially just checking data-copies + weight_dtype = torch.bfloat16 + weight_scale_dtype = torch.bfloat16 + + test_config = TestConfig( + num_layers=num_layers, + num_experts=num_experts, + num_local_experts=num_local_experts, + num_topk=num_topk, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + weight_dtype=weight_dtype, + weight_scale_dtype=weight_scale_dtype, + column_major_scales=column_major_scales, + ) + + distributed_run( + _test_eplb_fml, + world_size, + test_config, + ) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b2f554efd8a6f..6619b64b2bbc0 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1391,7 +1391,48 @@ class FusedMoE(CustomOp): yield param_name def get_expert_weights(self) -> Iterable[torch.Tensor]: + def _maybe_make_contiguous( + name: str, p: torch.nn.Parameter + ) -> torch.nn.Parameter: + """ + In some cases, the last 2 dimensions (the non-expert dimensions) + of the weight scale tensor are transposed. This function + transforms the tensor (view update) so the tensor is contiguous(). + Example: A non-contiguous scale tensor, + `x` of shape (E, 32, 16) and stride (512, 1, 32) is transformed to + `x_` of shape (E, 16, 32) and stride (512, 32, 1). + Note that we specifically use torch.transpose() so `x_` refers + to the same underlying memory. The tensors `x` and `x_`, pointing + to the same underlying memory make this transformation safe in the + context of EPLB. i.e. It is the same memory and just the view + is different. + Note: This function handles the "weight_scale" tensors specifically. + This could however be generalized to handle similar tensors. + """ + if p.ndim != 3: + return p + if p.is_contiguous(): + # Already contiguous. do nothing. + return p + # p is non-contiguous. We only handle the case where the last 2 + # dimensions of the scales tensor is transposed. We can handle + # other cases when they become relevant. + is_transposed_12 = p.stride(1) == 1 and p.stride(2) != 1 + if "weight_scale" not in name or not is_transposed_12: + # do nothing. + return p + + # Do not update the layer paramater as the layer's MoE operations would + # expect the parameter's tensor to the same shape / stride. Instead, + # make a new torch.nn.Parameter that is used just in the context of + # EPLB. + return torch.nn.Parameter( + torch.transpose(p.data, 1, 2), requires_grad=False + ) + weights = list(self.named_parameters()) + weights = [(name, _maybe_make_contiguous(name, p)) for name, p in weights] + assert all( weight.is_contiguous() for name, weight in weights From c6fa3895e90f6daef4d223188f6b4156311f40c9 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 21 Nov 2025 22:45:00 +0000 Subject: [PATCH 002/129] [KV Connector] Fix async connector prefix cache metrics (#28585) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mark McLoughlin Co-authored-by: Nicolò Lucchesi --- tests/v1/core/test_scheduler.py | 17 +++++++++++++---- vllm/v1/core/sched/scheduler.py | 16 ++++++++-------- vllm/v1/request.py | 3 +++ 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 04e738293cd77..d9a69a77c9797 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1057,7 +1057,8 @@ def test_kv_connector_basic(is_async: bool): ) -def test_external_prefix_cache_metrics(): +@pytest.mark.parametrize("is_async", [False, True]) +def test_external_prefix_cache_metrics(is_async: bool): """ Verify connector prefix cache metrics are updated correctly when the scheduler processes requests with KV connector hits. @@ -1067,7 +1068,9 @@ def test_external_prefix_cache_metrics(): NUM_MATCHED_NEW_TOKENS = 4 scheduler = create_scheduler( enable_prefix_caching=False, - use_kv_connector=mock_kv(matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=False), + use_kv_connector=mock_kv( + matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=is_async + ), ) # --- Prepare simple requests --- @@ -1079,9 +1082,15 @@ def test_external_prefix_cache_metrics(): num_tokens=NUM_TOKENS, max_tokens=MAX_TOKENS, ) + req_ids = [] + req_to_index = {} + for i, request in enumerate(requests): + scheduler.add_request(request) + req_ids.append(request.request_id) + req_to_index[request.request_id] = i - for req in requests: - scheduler.add_request(req) + if is_async: + _step_until_kv_transfer_finished(scheduler, req_ids) # --- Trigger scheduling and simulate model output --- output = scheduler.schedule() diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 9195b112d8690..4cb5348cbacc3 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -470,6 +470,7 @@ class Scheduler(SchedulerInterface): skipped_waiting_requests.prepend_request(request) continue + request.num_external_computed_tokens = ext_tokens num_external_computed_tokens = ext_tokens # Total computed tokens (local + external). @@ -576,9 +577,6 @@ class Scheduler(SchedulerInterface): new_computed_blocks + new_blocks, num_external_computed_tokens, ) - self._update_connector_prefix_cache_stats( - request, num_external_computed_tokens - ) # Request was already popped from self.waiting # unless it was re-added above due to new_blocks being None. @@ -590,6 +588,8 @@ class Scheduler(SchedulerInterface): request.status = RequestStatus.WAITING_FOR_REMOTE_KVS continue + self._update_connector_prefix_cache_stats(request) + req_index += 1 self.running.append(request) if self.log_stats: @@ -1380,15 +1380,13 @@ class Scheduler(SchedulerInterface): # KV Connector Related Methods ######################################################################## - def _update_connector_prefix_cache_stats( - self, request: Request, num_external_tokens: int - ) -> None: + def _update_connector_prefix_cache_stats(self, request: Request) -> None: if self.connector_prefix_cache_stats is None: return self.connector_prefix_cache_stats.record( num_tokens=request.num_tokens, - num_hits=num_external_tokens, + num_hits=request.num_external_computed_tokens, preempted=request.num_preemptions > 0, ) @@ -1571,9 +1569,11 @@ class Scheduler(SchedulerInterface): marked_invalid_block = True # Truncate the computed tokens at the first failed block request.num_computed_tokens = idx * self.block_size - total_affected_tokens += ( + num_affected_tokens = ( req_num_computed_tokens - request.num_computed_tokens ) + total_affected_tokens += num_affected_tokens + request.num_external_computed_tokens -= num_affected_tokens if is_affected: if not marked_invalid_block: diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 3d92906fbf4b1..366cdadf5a583 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -121,6 +121,9 @@ class Request: # The number of requests being preempted by the scheduler self.num_preemptions = 0 + # The number of tokens that have been computed remotely. + self.num_external_computed_tokens = 0 + self.block_hashes: list[BlockHash] = [] self.get_hash_new_full_blocks: Callable[[], list[BlockHash]] | None = None if block_hasher is not None: From e9af6ba62ac99683139ff8d6bac87677fecf0b0c Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 21 Nov 2025 15:52:28 -0800 Subject: [PATCH 003/129] [Model Runner V2] Optimize Gumbel Sampling Kernel (#29210) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/sampler.py | 93 ++++++++++++++++------------------- 1 file changed, 43 insertions(+), 50 deletions(-) diff --git a/vllm/v1/worker/gpu/sampler.py b/vllm/v1/worker/gpu/sampler.py index 55f98ca6bb6a3..499e9d3b1538d 100644 --- a/vllm/v1/worker/gpu/sampler.py +++ b/vllm/v1/worker/gpu/sampler.py @@ -3,10 +3,9 @@ from collections.abc import Callable import torch -import triton -import triton.language as tl from vllm.config.model import LogprobsMode +from vllm.triton_utils import tl, triton from vllm.v1.outputs import LogprobsTensors, SamplerOutput from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p from vllm.v1.worker.gpu.states import SamplingMetadata @@ -78,7 +77,10 @@ class Sampler: @triton.jit def _gumbel_sample_kernel( - sampled_ptr, + local_argmax_ptr, + local_argmax_stride, + local_max_ptr, + local_max_stride, logits_ptr, logits_stride, seeds_ptr, @@ -88,40 +90,21 @@ def _gumbel_sample_kernel( BLOCK_SIZE: tl.constexpr, ): req_idx = tl.program_id(0) + block_idx = tl.program_id(1) + block = block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = block < vocab_size + logits = tl.load( + logits_ptr + req_idx * logits_stride + block, + mask=mask, + other=float("-inf"), + ) + is_greedy = tl.load(is_greedy_ptr + req_idx) - - if is_greedy: - # Greedy sampling. Don't apply gumbel noise. - max_val = float("-inf") - max_idx = 0 - for i in range(0, vocab_size, BLOCK_SIZE): - block = i + tl.arange(0, BLOCK_SIZE) - mask = block < vocab_size - logits = tl.load( - logits_ptr + req_idx * logits_stride + block, - mask=mask, - other=float("-inf"), - ) - - idx = tl.argmax(logits, axis=0) - value = tl.max(logits, axis=0) - is_greater = value > max_val - max_val = tl.where(is_greater, value, max_val) - max_idx = tl.where(is_greater, i + idx, max_idx) - tl.store(sampled_ptr + req_idx, max_idx) - return - - # Random sampling. - # Calculate gumbel seed. - seed = tl.load(seeds_ptr + req_idx) - pos = tl.load(pos_ptr + req_idx) - gumbel_seed = tl.randint(seed, pos) - - max_val = float("-inf") - max_idx = 0 - for i in range(0, vocab_size, BLOCK_SIZE): - block = i + tl.arange(0, BLOCK_SIZE) - mask = block < vocab_size + if not is_greedy: + # Calculate the seed for gumbel noise. + seed = tl.load(seeds_ptr + req_idx) + pos = tl.load(pos_ptr + req_idx) + gumbel_seed = tl.randint(seed, pos) # Generate gumbel noise. r = tl.rand(gumbel_seed, block).to(tl.float64) @@ -129,16 +112,13 @@ def _gumbel_sample_kernel( gumbel_noise = gumbel_noise.to(tl.float32) # Apply gumbel noise. - logits = tl.load(logits_ptr + req_idx * logits_stride + block, mask=mask) logits = tl.where(mask, logits + gumbel_noise, float("-inf")) - # Argmax to get the sampled token. - idx = tl.argmax(logits, axis=0) - value = tl.max(logits, axis=0) - is_greater = value > max_val - max_val = tl.where(is_greater, value, max_val) - max_idx = tl.where(is_greater, i + idx, max_idx) - tl.store(sampled_ptr + req_idx, max_idx) + idx = tl.argmax(logits, axis=0) + token_id = block_idx * BLOCK_SIZE + idx + value = tl.max(logits, axis=0) + tl.store(local_argmax_ptr + req_idx * local_argmax_stride + block_idx, token_id) + tl.store(local_max_ptr + req_idx * local_max_stride + block_idx, value) def gumbel_sample( @@ -148,23 +128,36 @@ def gumbel_sample( pos: torch.Tensor, # [num_reqs] ) -> torch.Tensor: num_reqs, vocab_size = logits.shape - # NOTE(woosuk): Use int64 for later indexing. - sampled = torch.empty( + BLOCK_SIZE = 1024 + num_blocks = triton.cdiv(vocab_size, BLOCK_SIZE) + local_argmax = torch.empty( num_reqs, + num_blocks, dtype=torch.int64, device=logits.device, ) - _gumbel_sample_kernel[(num_reqs,)]( - sampled, + local_max = torch.empty( + num_reqs, + num_blocks, + dtype=torch.float32, + device=logits.device, + ) + _gumbel_sample_kernel[(num_reqs, num_blocks)]( + local_argmax, + local_argmax.stride(0), + local_max, + local_max.stride(0), logits, logits.stride(0), seed, pos, is_greedy, vocab_size, - num_warps=8, - BLOCK_SIZE=16384, # type: ignore + BLOCK_SIZE=BLOCK_SIZE, ) + # NOTE(woosuk): Use int64 for later indexing. + max_block_idx = local_max.argmax(dim=-1, keepdim=True) + sampled = local_argmax.gather(dim=-1, index=max_block_idx).view(-1) return sampled From 30d64662387aaa74abcee294f27b83043f2d1ae6 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 21 Nov 2025 19:47:05 -0500 Subject: [PATCH 004/129] [BugFix] Fix Eagle `IndexError: list index out of range` for even `num_speculative_tokens` (#29102) Signed-off-by: Lucas Wilkinson --- tests/conftest.py | 8 ++++++++ vllm/config/compilation.py | 16 ++++++++++------ vllm/v1/spec_decode/eagle.py | 33 +++++++++++++++++++-------------- 3 files changed, 37 insertions(+), 20 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index b17081352edcf..5afdb225b8923 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -748,6 +748,14 @@ class VllmRunner: # being captured which can trigger edge cases that we don't handle yet. kwargs["compilation_config"] = {"cudagraph_capture_sizes": [4]} + # Make sure we have atleast one cudagraph large enough for a single decode. + if (speculative_config := kwargs.get("speculative_config")) and ( + num_speculative_tokens := speculative_config["num_speculative_tokens"] + ): + kwargs["compilation_config"]["cudagraph_capture_sizes"].append( + num_speculative_tokens + 1 + ) + with init_ctx: self.llm = LLM( model=model_name, diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index abdae49106120..9b5309598d0e2 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -950,14 +950,18 @@ class CompilationConfig: ) ) + if len(rounded_sizes) == 0 and multiple_of <= self.max_cudagraph_capture_size: + # if one valid but would be round_down use that + rounded_sizes = [multiple_of] + if len(rounded_sizes) == 0: - logger.warning( - "No valid cudagraph sizes after rounding to multiple of " - " num_speculative_tokens + 1 (%d); please adjust num_speculative_tokens" - " or max_cudagraph_capture_size (or cudagraph_capture_sizes)", - multiple_of, + raise ValueError( + f"No valid cudagraph sizes after rounding to multiple of {multiple_of} " + f"(num_speculative_tokens + 1 or tp if sequence parallelism is enabled)" + f" please adjust num_speculative_tokens ({uniform_decode_query_len - 1}" + f") or max_cudagraph_capture_size ({self.max_cudagraph_capture_size})" + f" or cudagraph_capture_sizes ({self.cudagraph_capture_sizes})" ) - return self.max_cudagraph_capture_size = rounded_sizes[-1] self.cudagraph_capture_sizes = rounded_sizes diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 0df9cd3214e53..3de418f1d13c8 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -86,9 +86,9 @@ class EagleProposer: self.use_cuda_graph = False - compilation_config = self.vllm_config.compilation_config - if compilation_config.mode == CompilationMode.VLLM_COMPILE: - cudagraph_mode = compilation_config.cudagraph_mode + self.compilation_config = self.vllm_config.compilation_config + if self.compilation_config.mode == CompilationMode.VLLM_COMPILE: + cudagraph_mode = self.compilation_config.cudagraph_mode if cudagraph_mode != CUDAGraphMode.NONE and not cudagraph_mode.has_mode( CUDAGraphMode.PIECEWISE ): @@ -103,13 +103,6 @@ class EagleProposer: and not self.speculative_config.enforce_eager ) - self.cudagraph_batch_sizes = ( - (sorted(self.vllm_config.compilation_config.cudagraph_capture_sizes)) - if self.use_cuda_graph - else [] - ) - - self.use_cuda_graph = self.use_cuda_graph and bool(self.cudagraph_batch_sizes) # persistent buffers for cuda graph self.input_ids = torch.zeros( self.max_num_tokens, dtype=torch.int32, device=device @@ -276,7 +269,10 @@ class EagleProposer: per_layer_attn_metadata[layer_name] = draft_indexer_metadata cudagraph_runtime_mode = CUDAGraphMode.NONE - if self.use_cuda_graph and num_tokens <= self.cudagraph_batch_sizes[-1]: + if ( + self.use_cuda_graph + and num_tokens <= self.compilation_config.max_cudagraph_capture_size + ): num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE else: @@ -366,7 +362,10 @@ class EagleProposer: # Generate the remaining draft tokens. draft_token_ids_list = [draft_token_ids] - if self.use_cuda_graph and batch_size <= self.cudagraph_batch_sizes[-1]: + if ( + self.use_cuda_graph + and batch_size <= self.compilation_config.max_cudagraph_capture_size + ): input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE else: @@ -777,7 +776,10 @@ class EagleProposer: self.positions[:num_tokens] = tree_positions.view(-1) self.hidden_states[:num_tokens] = tree_hidden_states.view(num_tokens, -1) - if self.use_cuda_graph and num_tokens <= self.cudagraph_batch_sizes[-1]: + if ( + self.use_cuda_graph + and num_tokens <= self.compilation_config.max_cudagraph_capture_size + ): num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE else: @@ -1114,7 +1116,10 @@ class EagleProposer: ) -> None: # Determine if CUDA graphs should be used for this run. cudagraphs_enabled = use_cudagraphs and self.use_cuda_graph - if cudagraphs_enabled and num_tokens <= self.cudagraph_batch_sizes[-1]: + if ( + cudagraphs_enabled + and num_tokens <= self.compilation_config.max_cudagraph_capture_size + ): num_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) with set_forward_context( From d5dbdbfcb2cfc2e4d82a1e2605576f1e4e440ca7 Mon Sep 17 00:00:00 2001 From: Angela Yi Date: Fri, 21 Nov 2025 17:10:27 -0800 Subject: [PATCH 005/129] [docs] Fix cudagraph mode config (#29170) Signed-off-by: angelayi --- docs/design/debug_vllm_compile.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/design/debug_vllm_compile.md b/docs/design/debug_vllm_compile.md index 3b454e851b54e..8912eb58f8ac7 100644 --- a/docs/design/debug_vllm_compile.md +++ b/docs/design/debug_vllm_compile.md @@ -9,7 +9,7 @@ TL;DR: |----------|----------|-------------| | --enforce-eager | enforce_eager=True | Turn off torch.compile and CUDAGraphs | | -O.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only | -| -O.cudagraph_mode=NONE | compilation_config=CompilationConfig(mode=CompilationMode.NONE) | Turn off CUDAGraphs only | +| -O.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only | | -O.backend=eager | compilation_config=CompilationConfig(backend='eager') | Turn off TorchInductor | ## vLLM-torch.compile overview From 9a3101b2ba6821488f4b7a9b93124e479edc4d3e Mon Sep 17 00:00:00 2001 From: Charlie Fu Date: Fri, 21 Nov 2025 19:11:02 -0600 Subject: [PATCH 006/129] [Rocm][CI] Fix DeekSeek V2-Lite Accuracy CI (#29135) Signed-off-by: charlifu --- .../deepseek_v2_lite_ep_eplb.sh | 12 +++++++++++- .../qwen30b_a3b_fp8_block_ep.sh | 11 ++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh index 5302f524a0ae4..8106f50f18f66 100644 --- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh +++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh @@ -17,7 +17,17 @@ wait_for_server() { } MODEL="deepseek-ai/DeepSeek-V2-lite" -BACKENDS=("deepep_high_throughput" "deepep_low_latency") + +# Set BACKENDS based on platform +if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then + # ROCm platform + BACKENDS=("allgather_reducescatter") + # Disable MOE padding for ROCm since it is causing eplb to fail + export VLLM_ROCM_MOE_PADDING=0 +else + # Non-ROCm platform (CUDA/other) + BACKENDS=("deepep_high_throughput" "deepep_low_latency") +fi cleanup() { if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh index a5135299297e2..0d06f53a183d0 100644 --- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh +++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh @@ -17,7 +17,16 @@ wait_for_server() { } MODEL="QWen/Qwen3-30B-A3B-FP8" -BACKENDS=("deepep_high_throughput" "deepep_low_latency") +# Set BACKENDS based on platform +if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then + # ROCm platform + BACKENDS=("allgather_reducescatter") + # Disable MOE padding for ROCm since it is causing eplb to fail + export VLLM_ROCM_MOE_PADDING=0 +else + # Non-ROCm platform (CUDA/other) + BACKENDS=("deepep_high_throughput" "deepep_low_latency") +fi cleanup() { if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then From 1d34eb11e057f6b42af36bdb13852d2701f04245 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 21 Nov 2025 20:14:49 -0500 Subject: [PATCH 007/129] [CI] Bug: Fix triton import issue (#29202) Signed-off-by: yewentao256 --- vllm/v1/worker/gpu/block_table.py | 3 +-- vllm/v1/worker/gpu/input_batch.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu/block_table.py b/vllm/v1/worker/gpu/block_table.py index ff24e88ede2c0..b31e9b179d26c 100644 --- a/vllm/v1/worker/gpu/block_table.py +++ b/vllm/v1/worker/gpu/block_table.py @@ -3,10 +3,9 @@ from collections.abc import Iterable import torch -import triton -import triton.language as tl from vllm.attention.backends.utils import PAD_SLOT_ID +from vllm.triton_utils import tl, triton from vllm.utils.math_utils import cdiv from vllm.v1.utils import CpuGpuBuffer diff --git a/vllm/v1/worker/gpu/input_batch.py b/vllm/v1/worker/gpu/input_batch.py index 89f375649146f..8313b32d29797 100644 --- a/vllm/v1/worker/gpu/input_batch.py +++ b/vllm/v1/worker/gpu/input_batch.py @@ -7,9 +7,8 @@ import numba import numba.types as types import numpy as np import torch -import triton -import triton.language as tl +from vllm.triton_utils import tl, triton from vllm.utils import random_uuid from vllm.utils.math_utils import cdiv from vllm.v1.utils import CpuGpuBuffer From d045e22dfeee61ece1a20ac4aec8cf483a42d406 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Sat, 22 Nov 2025 01:30:55 +0000 Subject: [PATCH 008/129] [Model][Qwen3VL] Tune Triton w8a8 block fp8 kernel for L40s (#29217) Signed-off-by: Lukas Geiger --- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ 4 files changed, 584 insertions(+) create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..6b2c1dc1312bf --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..b0eaf02a541ad --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..4cd357d5086ca --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..ca2179ddf3d2f --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + } +} From ed8e6843cc7167113bb9a436818f2e242c841b9f Mon Sep 17 00:00:00 2001 From: Ryan Rock Date: Fri, 21 Nov 2025 19:31:22 -0600 Subject: [PATCH 009/129] [CI/Build] Add terratorch for AMD (#29205) Signed-off-by: Ryan Rock --- requirements/rocm-test.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt index 432e11977872d..eabb5065bfceb 100644 --- a/requirements/rocm-test.txt +++ b/requirements/rocm-test.txt @@ -39,3 +39,6 @@ mteb[bm25s]>=1.38.11, <2 # Required for eval tests lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d + +# Plugins test +terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e From 5c8f2adf50e0cf2c5acf908ac796089cc45abdcf Mon Sep 17 00:00:00 2001 From: Jie Luo <65482183+Livinfly@users.noreply.github.com> Date: Sat, 22 Nov 2025 09:34:28 +0800 Subject: [PATCH 010/129] [Bugfix] Fix block size in block_table with PCP (#29094) Signed-off-by: Livinfly --- vllm/v1/worker/block_table.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 76e17f3797a1a..37ec0fb97e06b 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -84,7 +84,7 @@ class BlockTable: self.pcp_world_size = get_pcp_group().world_size self.pcp_rank = get_pcp_group().rank_in_group except AssertionError: - # DCP might not be initialized in testing + # PCP might not be initialized in testing self.pcp_world_size = 1 self.pcp_rank = 0 try: @@ -268,6 +268,11 @@ class MultiGroupBlockTable: # (max_model_len//dcp_world_size) tokens in kvcache, # so the block_size which used for calc max_num_blocks_per_req # must be multiplied by dcp_world_size. + try: + pcp_world_size = get_pcp_group().world_size + except AssertionError: + # PCP might not be initialized in testing + pcp_world_size = 1 try: dcp_world_size = get_dcp_group().world_size except AssertionError: @@ -280,12 +285,14 @@ class MultiGroupBlockTable: f"must match block_sizes length ({len(block_sizes)})" ) + total_cp_world_size = dcp_world_size * pcp_world_size + self.block_tables = [ BlockTable( block_size, max_num_reqs, max( - cdiv(max_model_len, block_size * dcp_world_size), + cdiv(max_model_len, block_size * total_cp_world_size), 1 + num_speculative_tokens, ), max_num_batched_tokens, From 1ef9c9e29480f95340e124cc7d81a2876a60516d Mon Sep 17 00:00:00 2001 From: qli88 Date: Fri, 21 Nov 2025 19:36:19 -0600 Subject: [PATCH 011/129] [CI/Build] Disable test_gptoss_tp.py in 'LoRA TP Test' group for ROCm platform (#29204) Signed-off-by: qli88 --- .buildkite/test-amd.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 4e2ff5c5a6bd5..4ee81fdabf665 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1319,7 +1319,10 @@ steps: - pytest -v -s -x lora/test_llama_tp.py - pytest -v -s -x lora/test_llm_with_multi_loras.py - pytest -v -s -x lora/test_olmoe_tp.py - - pytest -v -s -x lora/test_gptoss_tp.py + + # Disabled for now because MXFP4 backend on non-cuda platform + # doesn't support LoRA yet + #- pytest -v -s -x lora/test_gptoss_tp.py - label: Weight Loading Multiple GPU Test # 33min From 052950e5b3c48b1189df62f833ed9cff4aabb0bd Mon Sep 17 00:00:00 2001 From: FlintyLemming Date: Sat, 22 Nov 2025 09:37:51 +0800 Subject: [PATCH 012/129] Add fused MoE config for H200 E160 N192 fp8 (#29182) Signed-off-by: FlintyLemming --- ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 147 ++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..54fe5374cb95d --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} From 6f403501a085f4917e49e1714bdf44d2aabd06f9 Mon Sep 17 00:00:00 2001 From: rasmith Date: Fri, 21 Nov 2025 20:13:18 -0600 Subject: [PATCH 013/129] [CI/Build][AMD] Enable Entrypoints Integration Test (Pooling) to run without error on ROCm (#29212) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- tests/entrypoints/pooling/correctness/test_mteb_embed.py | 6 ++++++ tests/entrypoints/pooling/correctness/test_mteb_score.py | 6 ++++++ tests/entrypoints/pooling/llm/test_embedding.py | 6 ++++++ tests/entrypoints/pooling/llm/test_encode.py | 6 ++++++ tests/entrypoints/pooling/llm/test_score.py | 6 ++++++ tests/entrypoints/pooling/openai/test_embedding.py | 6 ++++++ .../entrypoints/pooling/openai/test_embedding_dimensions.py | 6 ++++++ .../entrypoints/pooling/openai/test_embedding_long_text.py | 6 ++++++ tests/entrypoints/pooling/openai/test_rerank.py | 6 ++++++ tests/entrypoints/pooling/openai/test_score.py | 6 ++++++ tests/entrypoints/pooling/openai/test_truncation.py | 6 ++++++ 11 files changed, 66 insertions(+) diff --git a/tests/entrypoints/pooling/correctness/test_mteb_embed.py b/tests/entrypoints/pooling/correctness/test_mteb_embed.py index 7f16638e51e2c..64673534fd32a 100644 --- a/tests/entrypoints/pooling/correctness/test_mteb_embed.py +++ b/tests/entrypoints/pooling/correctness/test_mteb_embed.py @@ -11,6 +11,12 @@ from tests.models.language.pooling_mteb_test.mteb_utils import ( run_mteb_embed_task, ) from tests.utils import RemoteOpenAIServer +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" diff --git a/tests/entrypoints/pooling/correctness/test_mteb_score.py b/tests/entrypoints/pooling/correctness/test_mteb_score.py index 1afe68b189db8..81ad0097187b0 100644 --- a/tests/entrypoints/pooling/correctness/test_mteb_score.py +++ b/tests/entrypoints/pooling/correctness/test_mteb_score.py @@ -13,6 +13,12 @@ from tests.models.language.pooling_mteb_test.mteb_utils import ( run_mteb_rerank, ) from tests.utils import RemoteOpenAIServer +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" diff --git a/tests/entrypoints/pooling/llm/test_embedding.py b/tests/entrypoints/pooling/llm/test_embedding.py index 5455b5f91fc09..f5eab4c29ae18 100644 --- a/tests/entrypoints/pooling/llm/test_embedding.py +++ b/tests/entrypoints/pooling/llm/test_embedding.py @@ -9,6 +9,12 @@ import torch.nn.functional as F from vllm import LLM, PoolingParams from vllm.distributed import cleanup_dist_env_and_memory +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) MODEL_NAME = "intfloat/multilingual-e5-small" diff --git a/tests/entrypoints/pooling/llm/test_encode.py b/tests/entrypoints/pooling/llm/test_encode.py index ca85d2758fce4..f86ecef2e4744 100644 --- a/tests/entrypoints/pooling/llm/test_encode.py +++ b/tests/entrypoints/pooling/llm/test_encode.py @@ -7,6 +7,12 @@ import pytest from vllm import LLM, PoolingParams from vllm.distributed import cleanup_dist_env_and_memory +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) MODEL_NAME = "intfloat/multilingual-e5-small" diff --git a/tests/entrypoints/pooling/llm/test_score.py b/tests/entrypoints/pooling/llm/test_score.py index b69c6a47c1913..ce36d61cb8476 100644 --- a/tests/entrypoints/pooling/llm/test_score.py +++ b/tests/entrypoints/pooling/llm/test_score.py @@ -9,6 +9,12 @@ import torch from tests.models.utils import softmax from vllm import LLM, PoolingParams from vllm.distributed import cleanup_dist_env_and_memory +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls" diff --git a/tests/entrypoints/pooling/openai/test_embedding.py b/tests/entrypoints/pooling/openai/test_embedding.py index e971b23e8f1a0..0c88d800e2f99 100644 --- a/tests/entrypoints/pooling/openai/test_embedding.py +++ b/tests/entrypoints/pooling/openai/test_embedding.py @@ -19,6 +19,7 @@ from vllm.entrypoints.openai.protocol import ( EmbeddingResponse, PoolingResponse, ) +from vllm.platforms import current_platform from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils.serial_utils import ( EMBED_DTYPE_TO_TORCH_DTYPE, @@ -28,6 +29,11 @@ from vllm.utils.serial_utils import ( decode_pooling_output, ) +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) + MODEL_NAME = "intfloat/multilingual-e5-small" DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 DTYPE = "bfloat16" diff --git a/tests/entrypoints/pooling/openai/test_embedding_dimensions.py b/tests/entrypoints/pooling/openai/test_embedding_dimensions.py index ba9fb64262772..8018dac2d3ffe 100644 --- a/tests/entrypoints/pooling/openai/test_embedding_dimensions.py +++ b/tests/entrypoints/pooling/openai/test_embedding_dimensions.py @@ -12,6 +12,12 @@ from tests.models.language.pooling.embed_utils import run_embedding_correctness_ from tests.models.utils import EmbedModelInfo from tests.utils import RemoteOpenAIServer from vllm.entrypoints.openai.protocol import EmbeddingResponse +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) MODELS = [ EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False), diff --git a/tests/entrypoints/pooling/openai/test_embedding_long_text.py b/tests/entrypoints/pooling/openai/test_embedding_long_text.py index f977c81a9084e..a9ade09dad0b5 100644 --- a/tests/entrypoints/pooling/openai/test_embedding_long_text.py +++ b/tests/entrypoints/pooling/openai/test_embedding_long_text.py @@ -16,6 +16,12 @@ import pytest_asyncio from tests.utils import RemoteOpenAIServer from vllm.entrypoints.openai.protocol import EmbeddingResponse +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) def _generate_random_text(word_count: int) -> str: diff --git a/tests/entrypoints/pooling/openai/test_rerank.py b/tests/entrypoints/pooling/openai/test_rerank.py index 1d85190c12a19..5a772e22a7414 100644 --- a/tests/entrypoints/pooling/openai/test_rerank.py +++ b/tests/entrypoints/pooling/openai/test_rerank.py @@ -8,6 +8,12 @@ import torch.nn.functional as F from tests.utils import RemoteOpenAIServer from vllm.entrypoints.openai.protocol import PoolingResponse, RerankResponse +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) MODEL_NAME = "BAAI/bge-reranker-base" DTYPE = "bfloat16" diff --git a/tests/entrypoints/pooling/openai/test_score.py b/tests/entrypoints/pooling/openai/test_score.py index b8f796d47efaa..ceff9d0181825 100644 --- a/tests/entrypoints/pooling/openai/test_score.py +++ b/tests/entrypoints/pooling/openai/test_score.py @@ -10,6 +10,12 @@ from torch import tensor from tests.utils import RemoteOpenAIServer from vllm.entrypoints.openai.protocol import ScoreResponse +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) MODELS = [ {"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True}, diff --git a/tests/entrypoints/pooling/openai/test_truncation.py b/tests/entrypoints/pooling/openai/test_truncation.py index 6889628dc9145..0d2d385840402 100644 --- a/tests/entrypoints/pooling/openai/test_truncation.py +++ b/tests/entrypoints/pooling/openai/test_truncation.py @@ -7,6 +7,12 @@ import pytest import pytest_asyncio from tests.utils import RemoteOpenAIServer +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2" max_model_len = 128 From 77e1c035d039ec546bb01b4915eed6b5735156c2 Mon Sep 17 00:00:00 2001 From: Yihua Cheng Date: Fri, 21 Nov 2025 19:18:00 -0800 Subject: [PATCH 014/129] [chore][LMCache connector] Remove useless logs from lmcache connector (#29069) Signed-off-by: ApostaC --- .../v1/lmcache_integration/multi_process_adapter.py | 1 - .../kv_transfer/kv_connector/v1/lmcache_mp_connector.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py index ab2eeed9f6b8a..6acfb73997f25 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py @@ -310,7 +310,6 @@ class LMCacheMPWorkerAdapter: request_id, result, ) - logger.info("Retrieve request for request_id=%s finished", request_id) # Remove the finished requests from the tracking dicts for request_id in finished_stores: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py index 22ddabbf1e352..d1d3e475cc889 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py @@ -469,9 +469,6 @@ class LMCacheMPConnector(KVConnectorBase_V1): ops.append(meta.op) if len(request_ids) > 0: - logger.info( - "HERE! SUBMITTING THE BATCHED RETRIEVE REQUESTS %s", request_ids - ) self.worker_adapter.batched_submit_retrieve_requests( request_ids, ops, event ) From fd65015a14be5f2ce663cd959dff6970285c54b4 Mon Sep 17 00:00:00 2001 From: rasmith Date: Fri, 21 Nov 2025 21:34:33 -0600 Subject: [PATCH 015/129] [CI/Build] Only use supported types and features on ROCm in MoE kernel tests (#29149) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- tests/kernels/moe/test_batched_moe.py | 9 +++++++-- tests/kernels/moe/test_block_fp8.py | 5 +++++ tests/kernels/moe/test_gpt_oss_triton_kernels.py | 5 +++++ tests/kernels/moe/test_modular_kernel_combinations.py | 6 ++++++ tests/kernels/moe/test_moe_permute_unpermute.py | 6 ++++++ tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py | 6 ++++++ tests/kernels/moe/test_triton_moe_ptpc_fp8.py | 6 ++++++ 7 files changed, 41 insertions(+), 2 deletions(-) diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py index 2285709fa7d60..dab1207d78031 100644 --- a/tests/kernels/moe/test_batched_moe.py +++ b/tests/kernels/moe/test_batched_moe.py @@ -39,6 +39,11 @@ MNK_FACTORS = [ NUM_EXPERTS = [8, 64] TOP_KS = [1, 2, 6] +DTYPES = [torch.bfloat16] + +if not current_platform.is_fp8_fnuz(): + DTYPES.append(torch.float8_e4m3fn) + vllm_config = VllmConfig() @@ -96,7 +101,7 @@ class BatchedMMTensors: @pytest.mark.parametrize("max_tokens_per_expert", [32, 224, 512]) @pytest.mark.parametrize("K", [128, 1024]) @pytest.mark.parametrize("N", [128, 1024]) -@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16]) +@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("block_shape", [None, [128, 128]]) @pytest.mark.parametrize("per_act_token_quant", [False, True]) def test_batched_mm( @@ -229,7 +234,7 @@ def test_batched_mm( @pytest.mark.parametrize(("m", "n", "k"), MNK_FACTORS) @pytest.mark.parametrize("e", NUM_EXPERTS) @pytest.mark.parametrize("topk", TOP_KS) -@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16]) +@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("per_act_token_quant", [False, True]) @pytest.mark.parametrize("block_shape", [None, [128, 128]]) @pytest.mark.parametrize("input_scales", [False]) diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py index 88db4b3e537c2..b0ff1e64e3219 100644 --- a/tests/kernels/moe/test_block_fp8.py +++ b/tests/kernels/moe/test_block_fp8.py @@ -31,6 +31,11 @@ dg_available = has_deep_gemm() if current_platform.get_device_capability() < (9, 0): pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True) +if current_platform.is_fp8_fnuz(): + pytest.skip( + "Tests in this file require float8_e4m3fn and platform does not support", + allow_module_level=True, + ) vllm_config = VllmConfig() diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py index af33fd4e3fc3b..98e80ec029777 100644 --- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py +++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py @@ -270,6 +270,11 @@ class Case: @pytest.mark.parametrize("num_token", [2]) @pytest.mark.parametrize("tp", [1, 2, 4, 8]) def test_equiv(num_token, a_dtype, w_dtype, tp): + from triton_kernels.tensor_details import layout + + if not hasattr(layout, "make_default_matmul_mxfp4_w_layout"): + pytest.skip("make_default_matmul_mxfp4_w_layout not available") + M = num_token E = ModelConfig.num_experts K = ModelConfig.hidden_size diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index e3b8621b452fa..2a30ef2355529 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -46,6 +46,12 @@ meets_multi_gpu_requirements = pytest.mark.skipif( reason="Requires deep_ep or deep_gemm or pplx or flashinfer packages", ) +if current_platform.is_fp8_fnuz(): + pytest.skip( + "Tests in this file require float8_e4m3fn and platform does not support", + allow_module_level=True, + ) + def format_result(verbose, msg, ex=None): if ex is not None: diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py index ba1f657b3ecda..12dd322dccc52 100644 --- a/tests/kernels/moe/test_moe_permute_unpermute.py +++ b/tests/kernels/moe/test_moe_permute_unpermute.py @@ -23,6 +23,12 @@ TOP_KS = [2, 6, 8] EP_SIZE = [1, 4, 16] current_platform.seed_everything(0) +if current_platform.is_rocm(): + pytest.skip( + "moe_permute_unpermute_supported is not defined for ROCm", + allow_module_level=True, + ) + def torch_permute( hidden_states: torch.Tensor, diff --git a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py index d6b78dd2c2323..b220205759e2d 100644 --- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py +++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py @@ -14,6 +14,12 @@ from vllm.platforms import current_platform from vllm.utils.deep_gemm import DeepGemmQuantScaleFMT, has_deep_gemm from vllm.utils.math_utils import cdiv, round_up +if current_platform.is_fp8_fnuz(): + pytest.skip( + "Tests in this file require float8_e4m3fn and platform does not support", + allow_module_level=True, + ) + fp8_dtype = torch.float8_e4m3fn CASES = [ diff --git a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py index 7a467e160b784..0ab025dceca40 100644 --- a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py +++ b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py @@ -19,6 +19,12 @@ if current_platform.get_device_capability() < (9, 0): vllm_config = VllmConfig() +if current_platform.is_fp8_fnuz(): + pytest.skip( + "Tests in this file require float8_e4m3fn and platform does not support", + allow_module_level=True, + ) + def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16): """Matrix multiplication function that supports per-token input From 933f67ecd81231ebfa5e2434d3ae3819b6c28068 Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Fri, 21 Nov 2025 19:59:07 -0800 Subject: [PATCH 016/129] [Bugfix]Fix a conditional to not check zero value (#28754) Signed-off-by: Yanan Cao --- vllm/compilation/caching.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/compilation/caching.py b/vllm/compilation/caching.py index 63b7ad7279e37..6297d9f995aa4 100644 --- a/vllm/compilation/caching.py +++ b/vllm/compilation/caching.py @@ -116,7 +116,8 @@ class VllmSerializableFunction(SerializableCallable): the AOT compiled path. """ compile_inputs = [ - inp or example_inputs[i] for i, inp in enumerate(fn.example_inputs) + inp if inp is not None else example_inputs[i] + for i, inp in enumerate(fn.example_inputs) ] with tracing(TracingContext(fake_mode)): fn.optimized_call = vllm_backend( From 1489902b531bb649f8110c94572b2d8b753a72cc Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 22 Nov 2025 12:01:30 +0800 Subject: [PATCH 017/129] [LoRA] Cleanup FusedMoEWithLoRA (#29187) Signed-off-by: Jee Jee Li --- vllm/lora/layers/fused_moe.py | 193 ++++++++++++------------ vllm/lora/punica_wrapper/punica_base.py | 4 +- vllm/lora/punica_wrapper/punica_gpu.py | 4 +- 3 files changed, 98 insertions(+), 103 deletions(-) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index adf30855cafc3..5aeaca8de5e53 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -42,6 +42,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): self.tp_size = get_tensor_model_parallel_world_size() self.tp_rank = get_tensor_model_parallel_rank() self.device = base_layer.w2_weight.device + self.w13_slices = 2 self._inject_lora_into_fused_moe() def _normalize_keys(self, config: dict[str, int | None]) -> dict[str, int | None]: @@ -60,8 +61,8 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): def _get_lora_moe_configs( self, op_prefix: str, - lora_a_stacked: torch.Tensor, - lora_b_stacked: torch.Tensor, + num_loras: int, + rank: int, num_slices: int, M: int, layer: FusedMoE, @@ -69,23 +70,25 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): config_dtype: str, ): if envs.VLLM_TUNED_CONFIG_FOLDER: + hidden_size = layer.hidden_size + intermediate_size = layer.intermediate_size_per_partition shrink_config = get_lora_op_configs( op_type=f"fused_moe_lora_{op_prefix}_shrink", - max_loras=lora_a_stacked.shape[0], + max_loras=num_loras, batch=M, - hidden_size=lora_a_stacked.shape[-1], - rank=lora_a_stacked.shape[-2], + hidden_size=hidden_size, + rank=rank, num_slices=num_slices, - moe_intermediate_size=lora_b_stacked.shape[-2], + moe_intermediate_size=intermediate_size, ) expand_config = get_lora_op_configs( op_type=f"fused_moe_lora_{op_prefix}_expand", - max_loras=lora_a_stacked.shape[0], + max_loras=num_loras, batch=M, - hidden_size=lora_a_stacked.shape[-1], - rank=lora_a_stacked.shape[-2], + hidden_size=hidden_size, # lora_a_stacked.shape[-1], + rank=rank, num_slices=num_slices, - moe_intermediate_size=lora_b_stacked.shape[-2], + moe_intermediate_size=intermediate_size, # lora_b_stacked.shape[-2], ) else: # fall back to the default config get_config_func = functools.partial( @@ -152,12 +155,12 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE num_tokens = hidden_states.size(0) M = min(num_tokens, CHUNK_SIZE) - + max_lora_rank = self.w13_lora_a_stacked[0].shape[-2] shrink_config, expand_config = self._get_lora_moe_configs( op_prefix="w13", - lora_a_stacked=self.w1_lora_a_stacked, - lora_b_stacked=self.w1_lora_b_stacked, - num_slices=2, + num_loras=self.max_loras, + rank=max_lora_rank, + num_slices=self.w13_slices, M=M, layer=layer, top_k=top_k, @@ -165,7 +168,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): ) # get the block size of m from customized config or default config - max_loras = self.w1_lora_a_stacked.shape[0] ( sorted_token_ids_lora, expert_ids_lora, @@ -175,7 +177,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): num_tokens, shrink_config["BLOCK_SIZE_M"], self.base_layer.local_num_experts, - max_loras, + self.max_loras, self.adapter_enabled, expert_map, ) @@ -186,17 +188,15 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): num_tokens_post_padded_lora ) - w13_lora_a_stacked = [self.w1_lora_a_stacked, self.w3_lora_a_stacked] - w13_lora_b_stacked = [self.w1_lora_b_stacked, self.w3_lora_b_stacked] - max_lora_rank = self.w1_lora_a_stacked.shape[-2] - expert_ids_lora = expert_ids_lora.view(max_loras, -1) - sorted_token_ids_lora = sorted_token_ids_lora.view(max_loras, -1) + expert_ids_lora = expert_ids_lora.view(self.max_loras, -1) + sorted_token_ids_lora = sorted_token_ids_lora.view(self.max_loras, -1) + # self.punica_wrapper.add_lora_fused_moe( input.view(-1, top_k, input.shape[-1]), hidden_states, - w13_lora_a_stacked, - w13_lora_b_stacked, + self.w13_lora_a_stacked, + self.w13_lora_b_stacked, topk_weights, sorted_token_ids_lora, expert_ids_lora, @@ -230,11 +230,11 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE num_tokens = hidden_states.size(0) M = min(num_tokens, CHUNK_SIZE) - + max_lora_rank = self.w2_lora_a_stacked.shape[-2] shrink_config, expand_config = self._get_lora_moe_configs( op_prefix="w2", - lora_a_stacked=self.w2_lora_a_stacked, - lora_b_stacked=self.w2_lora_b_stacked, + num_loras=self.max_loras, + rank=max_lora_rank, num_slices=1, M=M, layer=layer, @@ -247,20 +247,19 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): num_tokens_post_padded_lora = moe_state_dict[ "num_tokens_post_padded_lora" ] - max_loras = self.w1_lora_a_stacked.shape[0] - expert_ids_lora = expert_ids_lora.view(max_loras, -1) - sorted_token_ids_lora = sorted_token_ids_lora.view(max_loras, -1) + + expert_ids_lora = expert_ids_lora.view(self.max_loras, -1) + sorted_token_ids_lora = sorted_token_ids_lora.view(self.max_loras, -1) intermediate_cache2 = moe_state_dict["intermediate_cache2"] intermediate_cache3 = args[0] - max_lora_rank = self.w2_lora_a_stacked.shape[-2] shard_size_w2 = divide(self.base_layer.hidden_size, self.tp_size) self.punica_wrapper.add_lora_fused_moe( intermediate_cache3, intermediate_cache2, - [self.w2_lora_a_stacked], - [self.w2_lora_b_stacked], + (self.w2_lora_a_stacked,), + (self.w2_lora_b_stacked,), topk_weights, sorted_token_ids_lora, expert_ids_lora, @@ -289,7 +288,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): fused_experts.moe_sum = moe_sum_decorator( self.base_layer, fused_experts.moe_sum ) - self.base_layer.quant_method = FusedMoEModularMethod( self.base_layer.quant_method, m_fused_moe_fn ) @@ -301,33 +299,42 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): model_config: PretrainedConfig | None = None, ) -> None: """Initializes lora matrices.""" + assert self.w13_slices == 2 + self.max_loras = lora_config.max_loras self.fully_sharded = lora_config.fully_sharded_loras self.adapter_enabled = torch.tensor( [0] * (max_loras + 1), dtype=torch.int, device=self.device ) - self.w1_lora_a_stacked = torch.zeros( - ( - max_loras, - self.base_layer.local_num_experts, - lora_config.max_lora_rank - if not self.fully_sharded - else divide(lora_config.max_lora_rank, self.tp_size), - self.base_layer.hidden_size, - ), - dtype=lora_config.lora_dtype, - device=self.device, + self.w13_lora_a_stacked = tuple( + torch.zeros( + ( + max_loras, + self.base_layer.local_num_experts, + lora_config.max_lora_rank + if not self.fully_sharded + else divide(lora_config.max_lora_rank, self.tp_size), + self.base_layer.hidden_size, + ), + dtype=lora_config.lora_dtype, + device=self.device, + ) + for _ in range(self.w13_slices) ) - self.w1_lora_b_stacked = torch.zeros( - ( - max_loras, - self.base_layer.local_num_experts, - self.base_layer.intermediate_size_per_partition, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.device, + + self.w13_lora_b_stacked = tuple( + torch.zeros( + ( + max_loras, + self.base_layer.local_num_experts, + self.base_layer.intermediate_size_per_partition, + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + device=self.device, + ) + for _ in range(self.w13_slices) ) self.w2_lora_a_stacked = torch.zeros( @@ -353,29 +360,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): device=self.device, ) - self.w3_lora_a_stacked = torch.zeros( - ( - max_loras, - self.base_layer.local_num_experts, - lora_config.max_lora_rank - if not self.fully_sharded - else divide(lora_config.max_lora_rank, self.tp_size), - self.base_layer.hidden_size, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - self.w3_lora_b_stacked = torch.zeros( - ( - max_loras, - self.base_layer.local_num_experts, - self.base_layer.intermediate_size_per_partition, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - # They will be used by 'LoRALayerWeights.create_dummy_lora_weights' # to create a dummy LoRA weights. self.lora_a_stacked = [] @@ -383,20 +367,28 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): for lora_id in range(max_loras): for experts_id in range(self.base_layer.local_num_experts): # gate_proj,down_proj,up_proj - self.lora_a_stacked.append(self.w1_lora_a_stacked[lora_id][experts_id]) + self.lora_a_stacked.append( + self.w13_lora_a_stacked[0][lora_id][experts_id] + ) self.lora_a_stacked.append(self.w2_lora_a_stacked[lora_id][experts_id]) - self.lora_a_stacked.append(self.w3_lora_a_stacked[lora_id][experts_id]) + self.lora_a_stacked.append( + self.w13_lora_a_stacked[1][lora_id][experts_id] + ) - self.lora_b_stacked.append(self.w1_lora_b_stacked[lora_id][experts_id]) + self.lora_b_stacked.append( + self.w13_lora_b_stacked[0][lora_id][experts_id] + ) self.lora_b_stacked.append(self.w2_lora_b_stacked[lora_id][experts_id]) - self.lora_b_stacked.append(self.w3_lora_b_stacked[lora_id][experts_id]) + self.lora_b_stacked.append( + self.w13_lora_b_stacked[1][lora_id][experts_id] + ) def reset_lora(self, index: int): """Resets the lora weights at index back to 0.""" - self.w1_lora_a_stacked[index] = 0 - self.w1_lora_b_stacked[index] = 0 - self.w3_lora_a_stacked[index] = 0 - self.w3_lora_b_stacked[index] = 0 + for pos in range(self.w13_slices): + self.w13_lora_a_stacked[pos][index] = 0 + self.w13_lora_b_stacked[pos][index] = 0 + self.w2_lora_a_stacked[index] = 0 self.w2_lora_b_stacked[index] = 0 self.adapter_enabled[index] = 0 @@ -434,7 +426,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): if self.fully_sharded: # Based on S-LoRA, we slice W1 and W3 A along the rank dim, # and W2 B along the hidden_size dim. - w13_shard_size = self.w1_lora_a_stacked[index, eid].shape[0] + w13_shard_size = self.w13_lora_a_stacked[0][index, eid].shape[0] w13_start_idx = self.tp_rank * w13_shard_size w13_end_idx = (self.tp_rank + 1) * w13_shard_size w1_lora_a = w1_lora_a[w13_start_idx:w13_end_idx, :] @@ -444,29 +436,32 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): w2_start_idx = self.tp_rank * w2_shard_size w2_end_idx = (self.tp_rank + 1) * w2_shard_size w2_lora_b = w2_lora_b[w2_start_idx:w2_end_idx, :] - - self.w1_lora_a_stacked[ + # w1 lora_a + self.w13_lora_a_stacked[0][ index, eid, : w1_lora_a.shape[0], : w1_lora_a.shape[1] ].copy_(w1_lora_a, non_blocking=True) - - self.w3_lora_a_stacked[ + # w3 lora_a + self.w13_lora_a_stacked[1][ index, eid, : w3_lora_a.shape[0], : w3_lora_a.shape[1] ].copy_(w3_lora_a, non_blocking=True) + # w1 lora_b + self.w13_lora_b_stacked[0][ + index, eid, : w1_lora_b.shape[0], : w1_lora_b.shape[1] + ].copy_(w1_lora_b, non_blocking=True) + # w3 lora_b + self.w13_lora_b_stacked[1][ + index, eid, : w3_lora_b.shape[0], : w3_lora_b.shape[1] + ].copy_(w3_lora_b, non_blocking=True) + + self.w2_lora_a_stacked[ + index, eid, : w2_lora_a.shape[0], : w2_lora_a.shape[1] + ].copy_(w2_lora_a, non_blocking=True) + self.w2_lora_b_stacked[ index, eid, : w2_lora_b.shape[0], : w2_lora_b.shape[1] ].copy_(w2_lora_b, non_blocking=True) - self.w1_lora_b_stacked[ - index, eid, : w1_lora_b.shape[0], : w1_lora_b.shape[1] - ].copy_(w1_lora_b, non_blocking=True) - self.w3_lora_b_stacked[ - index, eid, : w3_lora_b.shape[0], : w3_lora_b.shape[1] - ].copy_(w3_lora_b, non_blocking=True) - self.w2_lora_a_stacked[ - index, eid, : w2_lora_a.shape[0], : w2_lora_a.shape[1] - ].copy_(w2_lora_a, non_blocking=True) - @classmethod def can_replace_layer( cls, diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index 7c0fc8167711d..ce38751e4b6a7 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -470,8 +470,8 @@ class PunicaWrapperBase(PunicaWrapperABC): self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: list[torch.Tensor], - lora_b_stacked: list[torch.Tensor], + lora_a_stacked: tuple[torch.Tensor, ...], + lora_b_stacked: tuple[torch.Tensor, ...], topk_weights: torch.Tensor, sorted_token_ids: torch.Tensor, expert_ids: torch.Tensor, diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index 52138ef0cc3b0..ef4b4ab7c3497 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -360,8 +360,8 @@ class PunicaWrapperGPU(PunicaWrapperBase): self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: list[torch.Tensor], - lora_b_stacked: list[torch.Tensor], + lora_a_stacked: tuple[torch.Tensor, ...], + lora_b_stacked: tuple[torch.Tensor, ...], topk_weights: torch.Tensor, sorted_token_ids: torch.Tensor, expert_ids: torch.Tensor, From e9056056fbacecbac4318bd0323745fdd7fe55b6 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 21 Nov 2025 20:21:35 -0800 Subject: [PATCH 018/129] [Model Runner V2] Limit cudagraph size to max decode batch size (#29221) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/cudagraph_utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py index 31a706475243c..763bd61834625 100644 --- a/vllm/v1/worker/gpu/cudagraph_utils.py +++ b/vllm/v1/worker/gpu/cudagraph_utils.py @@ -27,9 +27,11 @@ class CudaGraphManager: device: torch.device, ): self.vllm_config = vllm_config + self.scheduler_config = vllm_config.scheduler_config self.device = device self.max_model_len = vllm_config.model_config.max_model_len + self.max_num_reqs = self.scheduler_config.max_num_seqs self.dp_size = vllm_config.parallel_config.data_parallel_size self.compilation_config = vllm_config.compilation_config assert self.compilation_config is not None @@ -39,9 +41,11 @@ class CudaGraphManager: else: self.cudagraph_mode = self.compilation_config.cudagraph_mode if self.compilation_config.cudagraph_capture_sizes is not None: - self.cudagraph_sizes = sorted( - self.compilation_config.cudagraph_capture_sizes - ) + cudagraph_sizes = sorted(self.compilation_config.cudagraph_capture_sizes) + # Limit the cudagraph sizes to the max decode batch size. + self.cudagraph_sizes = [ + x for x in cudagraph_sizes if x <= self.max_num_reqs + ] else: self.cudagraph_sizes = [] self.padded_sizes = self._init_padded_sizes() @@ -54,9 +58,10 @@ class CudaGraphManager: if not self.cudagraph_mode.has_full_cudagraphs(): # Full cuda graphs are not used. return {} + if not self.cudagraph_sizes: + return {} padded_sizes: dict[int, int] = {} - assert len(self.cudagraph_sizes) > 0 for i in range(1, self.cudagraph_sizes[-1] + 1): for x in self.cudagraph_sizes: if i <= x: From 742e9ff6b39ad0433bac0d7417a41bbdc74854a3 Mon Sep 17 00:00:00 2001 From: Andrew Xia Date: Fri, 21 Nov 2025 23:42:11 -0800 Subject: [PATCH 019/129] [responsesAPI] parse reasoning item input (#28248) Signed-off-by: Andrew Xia Co-authored-by: Andrew Xia Co-authored-by: Cyrus Leung --- .../online_serving/openai_responses_client.py | 44 ++++++++++++ .../openai/test_response_api_simple.py | 71 +++++++++++++++++++ .../openai/test_response_api_with_harmony.py | 27 ++++++- tests/entrypoints/test_responses_utils.py | 58 +++++++++++++++ vllm/entrypoints/responses_utils.py | 13 ++++ 5 files changed, 212 insertions(+), 1 deletion(-) create mode 100644 examples/online_serving/openai_responses_client.py create mode 100644 tests/entrypoints/openai/test_response_api_simple.py diff --git a/examples/online_serving/openai_responses_client.py b/examples/online_serving/openai_responses_client.py new file mode 100644 index 0000000000000..b4eb24671507a --- /dev/null +++ b/examples/online_serving/openai_responses_client.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Set up this example by starting a vLLM OpenAI-compatible server. +Reasoning models can be used through the Responses API as seen here +https://platform.openai.com/docs/api-reference/responses +For example: +vllm serve Qwen/Qwen3-8B --reasoning-parser qwen3 + +""" + +from openai import OpenAI + +input_messages = [{"role": "user", "content": "What model are you?"}] + + +def main(): + base_url = "http://localhost:8000/v1" + client = OpenAI(base_url=base_url, api_key="empty") + model = "Qwen/Qwen3-8B" # get_first_model(client) + response = client.responses.create( + model=model, + input=input_messages, + ) + + for message in response.output: + if message.type == "reasoning": + # append reasoning message + input_messages.append(message) + + response_2 = client.responses.create( + model=model, + input=input_messages, + ) + print(response_2.output_text) + # I am Qwen, a large language model developed by Alibaba Cloud. + # I am designed to assist with a wide range of tasks, including + # answering questions, creating content, coding, and engaging in + # conversations. I can help with various topics and provide + # information or support in multiple languages. How can I assist you today? + + +if __name__ == "__main__": + main() diff --git a/tests/entrypoints/openai/test_response_api_simple.py b/tests/entrypoints/openai/test_response_api_simple.py new file mode 100644 index 0000000000000..425b8199a0fd0 --- /dev/null +++ b/tests/entrypoints/openai/test_response_api_simple.py @@ -0,0 +1,71 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import pytest +import pytest_asyncio +from openai import OpenAI + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "Qwen/Qwen3-8B" + + +@pytest.fixture(scope="module") +def server(): + args = ["--reasoning-parser", "qwen3", "--max_model_len", "5000"] + env_dict = dict( + VLLM_ENABLE_RESPONSES_API_STORE="1", + # uncomment for tool calling + # PYTHON_EXECUTION_BACKEND="dangerously_use_uv", + ) + + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_basic(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="What is 13 * 24?", + ) + assert response is not None + print("response: ", response) + assert response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_reasoning_item(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input=[ + {"type": "message", "content": "Hello.", "role": "user"}, + { + "type": "reasoning", + "id": "lol", + "content": [ + { + "type": "reasoning_text", + "text": "We need to respond: greeting.", + } + ], + "summary": [], + }, + ], + temperature=0.0, + ) + assert response is not None + assert response.status == "completed" + # make sure we get a reasoning and text output + assert response.output[0].type == "reasoning" + assert response.output[1].type == "message" + assert type(response.output[1].content[0].text) is str diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index dea8d2d28f61a..6251e1776c30a 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -35,7 +35,7 @@ GET_WEATHER_SCHEMA = { @pytest.fixture(scope="module") def server(): - args = ["--enforce-eager", "--tool-server", "demo"] + args = ["--enforce-eager", "--tool-server", "demo", "--max_model_len", "5000"] env_dict = dict( VLLM_ENABLE_RESPONSES_API_STORE="1", PYTHON_EXECUTION_BACKEND="dangerously_use_uv", @@ -550,6 +550,31 @@ def call_function(name, args): raise ValueError(f"Unknown function: {name}") +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_reasoning_item(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input=[ + {"type": "message", "content": "Hello.", "role": "user"}, + { + "type": "reasoning", + "id": "lol", + "content": [ + { + "type": "reasoning_text", + "text": "We need to respond: greeting.", + } + ], + "summary": [], + }, + ], + temperature=0.0, + ) + assert response is not None + assert response.status == "completed" + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_function_calling(client: OpenAI, model_name: str): diff --git a/tests/entrypoints/test_responses_utils.py b/tests/entrypoints/test_responses_utils.py index 48bf06088bc05..91c818374e3fd 100644 --- a/tests/entrypoints/test_responses_utils.py +++ b/tests/entrypoints/test_responses_utils.py @@ -1,7 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +from openai.types.responses.response_reasoning_item import ( + Content, + ResponseReasoningItem, + Summary, +) + from vllm.entrypoints.responses_utils import ( + construct_chat_message_with_tool_call, convert_tool_responses_to_completions_format, ) @@ -28,3 +36,53 @@ class TestResponsesUtils: result = convert_tool_responses_to_completions_format(input_tool) assert result == {"type": "function", "function": input_tool} + + def test_construct_chat_message_with_tool_call(self): + item = ResponseReasoningItem( + id="lol", + summary=[], + type="reasoning", + content=[ + Content( + text="Leroy Jenkins", + type="reasoning_text", + ) + ], + encrypted_content=None, + status=None, + ) + formatted_item = construct_chat_message_with_tool_call(item) + assert formatted_item["role"] == "assistant" + assert formatted_item["reasoning"] == "Leroy Jenkins" + + item = ResponseReasoningItem( + id="lol", + summary=[ + Summary( + text='Hmm, the user has just started with a simple "Hello,"', + type="summary_text", + ) + ], + type="reasoning", + content=None, + encrypted_content=None, + status=None, + ) + + formatted_item = construct_chat_message_with_tool_call(item) + assert formatted_item["role"] == "assistant" + assert ( + formatted_item["reasoning"] + == 'Hmm, the user has just started with a simple "Hello,"' + ) + + item = ResponseReasoningItem( + id="lol", + summary=[], + type="reasoning", + content=None, + encrypted_content="TOP_SECRET_MESSAGE", + status=None, + ) + with pytest.raises(ValueError): + construct_chat_message_with_tool_call(item) diff --git a/vllm/entrypoints/responses_utils.py b/vllm/entrypoints/responses_utils.py index d966f58804b67..912e8a690573d 100644 --- a/vllm/entrypoints/responses_utils.py +++ b/vllm/entrypoints/responses_utils.py @@ -10,6 +10,7 @@ from openai.types.chat.chat_completion_message_tool_call_param import ( Function as FunctionCallTool, ) from openai.types.responses import ResponseFunctionToolCall +from openai.types.responses.response_reasoning_item import ResponseReasoningItem from openai.types.responses.tool import Tool from vllm import envs @@ -37,6 +38,18 @@ def construct_chat_message_with_tool_call( ) ], ) + elif isinstance(item, ResponseReasoningItem): + reasoning_content = "" + if item.encrypted_content: + raise ValueError("Encrypted content is not supported.") + if len(item.summary) == 1: + reasoning_content = item.summary[0].text + elif item.content and len(item.content) == 1: + reasoning_content = item.content[0].text + return { + "role": "assistant", + "reasoning": reasoning_content, + } elif item.get("type") == "function_call_output": # Append the function call output as a tool message. return ChatCompletionToolMessageParam( From ea38474ac564efdc09762ad066139b75cf68f924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mads=20Kildeg=C3=A5rd?= Date: Sat, 22 Nov 2025 10:58:22 +0100 Subject: [PATCH 020/129] [Frontend][Responses API] Multi-turn (with type: "output_text") support for non-harmony requests (#29175) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mads Kildegård --- vllm/entrypoints/chat_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index aaf8a3ae9d2dd..bf80856c1bbfc 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1283,6 +1283,7 @@ MM_PARSER_MAP: dict[ "text": lambda part: _TextParser(part).get("text", None), "thinking": lambda part: _ThinkParser(part).get("thinking", None), "input_text": lambda part: _TextParser(part).get("text", None), + "output_text": lambda part: _TextParser(part).get("text", None), "input_image": lambda part: _ResponsesInputImageParser(part).get("image_url", None), "image_url": lambda part: _ImageParser(part).get("image_url", {}).get("url", None), "image_embeds": lambda part: _ImageEmbedsParser(part).get("image_embeds", None), @@ -1463,7 +1464,7 @@ def _parse_chat_message_content_part( ) return None - if part_type in ("text", "input_text", "refusal", "thinking"): + if part_type in ("text", "input_text", "output_text", "refusal", "thinking"): str_content = cast(str, content) if wrap_dicts: return {"type": "text", "text": str_content} From 988ee66b0d54ec08a24135f7a947affe69e9dd52 Mon Sep 17 00:00:00 2001 From: jinghanhu Date: Sat, 22 Nov 2025 18:07:50 +0800 Subject: [PATCH 021/129] Handle triton kernel import exception (#29062) --- vllm/model_executor/layers/fused_moe/config.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 21eb4d590a7d1..1826fafa8c4f5 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -28,10 +28,11 @@ logger = init_logger(__name__) if has_triton_kernels(): try: from triton_kernels.matmul_ogs import PrecisionConfig - except ImportError: + except (ImportError, AttributeError) as e: logger.error( "Failed to import Triton kernels. Please make sure your triton " - "version is compatible." + "version is compatible. Error: %s", + e, ) From e6309acdba3a26e803d1ea7f66804f4ad30c2b9a Mon Sep 17 00:00:00 2001 From: "Jane (Yuan) Xu" <31798555+janeyx99@users.noreply.github.com> Date: Sat, 22 Nov 2025 05:35:32 -0500 Subject: [PATCH 022/129] Simplify `from_blob` usage in `get_cuda_view_from_cpu_tensor` (#29027) Signed-off-by: Jane Xu --- csrc/cuda_view.cu | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/csrc/cuda_view.cu b/csrc/cuda_view.cu index 938bd4ab7fc62..9853fc942bab7 100644 --- a/csrc/cuda_view.cu +++ b/csrc/cuda_view.cu @@ -22,15 +22,10 @@ torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor) { auto strides = cpu_tensor.strides(); auto options = cpu_tensor.options().device(torch::kCUDA); - // from_blob signature: from_blob(void *data, IntArrayRef sizes, ..., Deleter, - // const TensorOptions &) Provide a no-op deleter. The CPU tensor holds the - // memory, so we don't free it here. - auto deleter = [](void*) { - // no-op, since the memory is owned by the original CPU tensor - }; - + // use default no-op deleter, since the memory is owned by the original CPU + // tensor torch::Tensor cuda_tensor = - torch::from_blob(device_ptr, sizes, strides, deleter, options); + torch::from_blob(device_ptr, sizes, strides, options); TORCH_CHECK(cuda_tensor.device().is_cuda(), "Resulting tensor is not on CUDA device"); From a4fdf2405c737843d1e95e406959f3e2e6bcf899 Mon Sep 17 00:00:00 2001 From: rasmith Date: Sat, 22 Nov 2025 04:59:39 -0600 Subject: [PATCH 023/129] [CI/Build] Skip tests that require libcudart in test_lmcache_integration.py (#29228) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- .../kv_connector/unit/test_lmcache_integration.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/v1/kv_connector/unit/test_lmcache_integration.py b/tests/v1/kv_connector/unit/test_lmcache_integration.py index 11507d7cd4e7b..33418edc325af 100644 --- a/tests/v1/kv_connector/unit/test_lmcache_integration.py +++ b/tests/v1/kv_connector/unit/test_lmcache_integration.py @@ -9,6 +9,12 @@ # Assumption vs. Correctness Tests: # these unit tests do *not* test correctness of LMCache-side or vLLM-side logic # it is to ensure that assumptions LMCache makes about vLLM's interface are stable + +import pytest + +from vllm.platforms import current_platform + + def assumes(obj, attr, is_callable=False, is_instance_of=None): import inspect from dataclasses import is_dataclass @@ -48,6 +54,9 @@ def assumes(obj, attr, is_callable=False, is_instance_of=None): assert isinstance(attr_value, is_instance_of), assumption_msg +@pytest.mark.skipif( + current_platform.is_rocm(), reason="Requires libcudart.so, not available on ROCm" +) def test_multimodal_interface(): # protect against interface changes from vllm.multimodal.inputs import PlaceholderRange @@ -72,6 +81,9 @@ def test_multimodal_interface(): assert token_ids.tolist() == [0, 0, 0, 0, 4, 4369, 4369, 4369, 4369, 9] +@pytest.mark.skipif( + current_platform.is_rocm(), reason="Requires libcudart.so, not available on ROCm" +) def test_config_interface(): # protect against interface changes from vllm.config import VllmConfig @@ -146,6 +158,9 @@ def test_config_interface(): ) +@pytest.mark.skipif( + current_platform.is_rocm(), reason="Requires libcudart.so, not available on ROCm" +) def test_request_interface(): # protect against interface changes from types import NoneType From 8e22da1d7fcd43efd8fec18c0c0bf6a8e7cf61a6 Mon Sep 17 00:00:00 2001 From: rasmith Date: Sat, 22 Nov 2025 05:00:54 -0600 Subject: [PATCH 024/129] [CI/Build Don't add FLASHINFER backend in test_cpu_offloading.py (#29229) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- tests/v1/kv_offload/test_cpu_offloading.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/v1/kv_offload/test_cpu_offloading.py b/tests/v1/kv_offload/test_cpu_offloading.py index 3ee41c40859dc..406d4c0b4c1fd 100644 --- a/tests/v1/kv_offload/test_cpu_offloading.py +++ b/tests/v1/kv_offload/test_cpu_offloading.py @@ -12,10 +12,14 @@ from tqdm import tqdm from vllm import LLM, SamplingParams, TokensPrompt from vllm.config import KVEventsConfig, KVTransferConfig from vllm.distributed.kv_events import BlockStored, KVEventBatch +from vllm.platforms import current_platform from vllm.utils.system_utils import set_env_var CPU_BLOCK_SIZES = [48] -ATTN_BACKENDS = ["FLASH_ATTN", "FLASHINFER"] +ATTN_BACKENDS = ["FLASH_ATTN"] + +if current_platform.is_cuda(): + ATTN_BACKENDS.append("FLASHINFER") class MockSubscriber: From 5a4802588ed8f7918468986fce130c19ee721674 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 22 Nov 2025 19:34:15 +0800 Subject: [PATCH 025/129] [Misc] Further clean up chunked prefill and prefix caching init (#29186) Signed-off-by: DarkLight1337 --- tests/engine/test_arg_utils.py | 2 +- tests/v1/core/test_scheduler.py | 19 +++++++------------ tests/v1/core/utils.py | 11 +++-------- vllm/config/cache.py | 4 ++-- vllm/engine/arg_utils.py | 24 +++++++++++++++++++----- vllm/v1/core/sched/scheduler.py | 2 +- 6 files changed, 33 insertions(+), 29 deletions(-) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 472b1487ef440..10827e3b4b9cd 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -279,7 +279,7 @@ def test_prefix_cache_default(): args = parser.parse_args([]) engine_args = EngineArgs.from_cli_args(args=args) - assert not engine_args.enable_prefix_caching, "prefix caching defaults to off." + assert engine_args.enable_prefix_caching, "prefix caching should default to on." # with flag to turn it on. args = parser.parse_args(["--enable-prefix-caching"]) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index d9a69a77c9797..09acde6e08faa 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -76,11 +76,11 @@ def test_get_num_unfinished_requests(): @pytest.mark.parametrize( "enable_prefix_caching, prompt_logprobs", [ - (None, None), + (False, None), (True, 5), ], ) -def test_schedule(enable_prefix_caching: bool | None, prompt_logprobs: int | None): +def test_schedule(enable_prefix_caching: bool, prompt_logprobs: int | None): """Test scheduling. Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs """ @@ -582,12 +582,12 @@ def test_check_stop_min_tokens(): @pytest.mark.parametrize( "enable_prefix_caching, prompt_logprobs", [ - (None, None), + (False, None), (True, 5), ], ) def test_schedule_concurrent_batches( - enable_prefix_caching: bool | None, prompt_logprobs: int | None + enable_prefix_caching: bool, prompt_logprobs: int | None ): scheduler = create_scheduler( max_num_batched_tokens=1024, @@ -1425,7 +1425,7 @@ def create_scheduler_with_priority( model: str = "facebook/opt-125m", max_num_seqs: int = 16, max_num_batched_tokens: int = 8192, - enable_prefix_caching: bool | None = None, + enable_prefix_caching: bool = False, long_prefill_token_threshold: int = 0, disable_chunked_mm_input: bool = False, use_kv_connector: bool = False, @@ -1444,7 +1444,7 @@ def create_scheduler_with_priority( max_num_batch_tokens: max num tokens to batch enable_prefix_caching: optionally force APC config (True/False) or use default - (None) + (False) Returns: {class}`Scheduler` instance with priority scheduling @@ -1467,17 +1467,12 @@ def create_scheduler_with_priority( seed=42, ) # Cache config, optionally force APC - kwargs_cache = ( - {} - if enable_prefix_caching is None - else {"enable_prefix_caching": enable_prefix_caching} - ) cache_config = CacheConfig( block_size=block_size, gpu_memory_utilization=0.9, swap_space=0, cache_dtype="auto", - **kwargs_cache, + enable_prefix_caching=enable_prefix_caching, ) kv_transfer_config = ( KVTransferConfig( diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 65511c17473b2..6830f68736453 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -42,7 +42,7 @@ def create_scheduler( model: str = "facebook/opt-125m", max_num_seqs: int = 16, max_num_batched_tokens: int = 8192, - enable_prefix_caching: bool | None = None, + enable_prefix_caching: bool = False, long_prefill_token_threshold: int = 0, disable_chunked_mm_input: bool = False, use_kv_connector: None | bool | MockKVConfig = None, @@ -63,7 +63,7 @@ def create_scheduler( max_num_batch_tokens: max num tokens to batch enable_prefix_caching: optionally force APC config (True/False) or use default - (None) + (False) Returns: {class}`Scheduler` instance @@ -87,17 +87,12 @@ def create_scheduler( skip_tokenizer_init=skip_tokenizer_init, ) # Cache config, optionally force APC - kwargs_cache = ( - {} - if enable_prefix_caching is None - else {"enable_prefix_caching": enable_prefix_caching} - ) cache_config = CacheConfig( block_size=block_size, gpu_memory_utilization=0.9, swap_space=0, cache_dtype="auto", - **kwargs_cache, + enable_prefix_caching=enable_prefix_caching, ) kv_transfer_config = None if isinstance(use_kv_connector, MockKVConfig): diff --git a/vllm/config/cache.py b/vllm/config/cache.py index 2652c7c06ad0f..ef6928d8ebd5c 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -73,8 +73,8 @@ class CacheConfig: sliding_window: int | None = None """Sliding window size for the KV cache. This is primarily set in `ModelConfig` and that value should be manually duplicated here.""" - enable_prefix_caching: bool | None = None - """Whether to enable prefix caching. Enabled by default for V1.""" + enable_prefix_caching: bool = True + """Whether to enable prefix caching.""" prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256" """Set the hash algorithm for prefix caching:\n - "sha256" uses Pickle for object serialization before hashing.\n diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 888f57b1ac1df..611bf1b375849 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -425,7 +425,7 @@ class EngineArgs: ParallelConfig.max_parallel_loading_workers ) block_size: BlockSize | None = CacheConfig.block_size - enable_prefix_caching: bool | None = CacheConfig.enable_prefix_caching + enable_prefix_caching: bool | None = None prefix_caching_hash_algo: PrefixCachingHashAlgo = ( CacheConfig.prefix_caching_hash_algo ) @@ -1975,10 +1975,11 @@ class EngineArgs: if self.prefill_context_parallel_size > 1: default_chunked_prefill = False default_prefix_caching = False - logger.warning( + logger.warning_once( "--prefill-context-parallel-size > 1 is not compatible with " "chunked prefill and prefix caching now. Chunked prefill " - "and prefix caching have been disabled by default." + "and prefix caching have been disabled by default.", + scope="local", ) if self.enable_chunked_prefill is None: @@ -1988,15 +1989,27 @@ class EngineArgs: "%s chunked prefill by default", "Enabling" if default_chunked_prefill else "Disabling", ) + elif ( + model_config.runner_type == "generate" + and not self.enable_chunked_prefill + and default_chunked_prefill + ): + logger.warning_once( + "This model does not officially support disabling chunked prefill. " + "Disabling this manually may cause the engine to crash " + "or produce incorrect outputs.", + scope="local", + ) elif ( model_config.runner_type == "pooling" and self.enable_chunked_prefill and not default_chunked_prefill ): - logger.warning( + logger.warning_once( "This model does not officially support chunked prefill. " "Enabling this manually may cause the engine to crash " "or produce incorrect outputs.", + scope="local", ) if self.enable_prefix_caching is None: @@ -2011,10 +2024,11 @@ class EngineArgs: and self.enable_prefix_caching and not default_prefix_caching ): - logger.warning( + logger.warning_once( "This model does not officially support prefix caching. " "Enabling this manually may cause the engine to crash " "or produce incorrect outputs.", + scope="local", ) world_size = self.pipeline_parallel_size * self.tensor_parallel_size diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 4cb5348cbacc3..a7ec0de372631 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -180,7 +180,7 @@ class Scheduler(SchedulerInterface): self.kv_cache_manager = KVCacheManager( kv_cache_config=kv_cache_config, max_model_len=self.max_model_len, - enable_caching=bool(self.cache_config.enable_prefix_caching), + enable_caching=self.cache_config.enable_prefix_caching, use_eagle=self.use_eagle, log_stats=self.log_stats, enable_kv_cache_events=self.enable_kv_cache_events, From 6965a392a4cd38ee65ac6a9c2730e0a7c62a658d Mon Sep 17 00:00:00 2001 From: Nandan Vallamdasu Date: Sat, 22 Nov 2025 18:28:22 +0530 Subject: [PATCH 026/129] Fix: Resolve circular import in model_loader/utils.py (#29189) Signed-off-by: nandan2003 Signed-off-by: Nandan Vallamdasu Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung --- vllm/model_executor/model_loader/utils.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index e74434e9d12cb..1db6337f4c9f9 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -19,12 +19,7 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, ) -from vllm.model_executor.models.adapters import ( - as_embedding_model, - as_reward_model, - as_seq_cls_model, - try_create_mm_pooling_model_cls, -) + from vllm.model_executor.models.interfaces import SupportsQuant, supports_multimodal from vllm.utils.platform_utils import is_pin_memory_available @@ -172,6 +167,12 @@ _MODEL_ARCH_BY_HASH = dict[int, tuple[type[nn.Module], str]]() def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], str]: + from vllm.model_executor.models.adapters import ( + as_embedding_model, + as_reward_model, + as_seq_cls_model, + try_create_mm_pooling_model_cls, + ) architectures = getattr(model_config.hf_config, "architectures", []) model_cls, arch = model_config.registry.resolve_model_cls( From 2d4978a57e0addf55cde6113e9615ed064b72fb7 Mon Sep 17 00:00:00 2001 From: yihong Date: Sat, 22 Nov 2025 21:00:04 +0800 Subject: [PATCH 027/129] fix: clean up function never use in setup.py (#29061) Signed-off-by: yihong0618 --- setup.py | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/setup.py b/setup.py index 5591bcb132447..8871b04d8fc46 100644 --- a/setup.py +++ b/setup.py @@ -74,18 +74,6 @@ def is_ninja_available() -> bool: return which("ninja") is not None -def is_url_available(url: str) -> bool: - from urllib.request import urlopen - - status = None - try: - with urlopen(url) as f: - status = f.status - except Exception: - return False - return status == 200 - - class CMakeExtension(Extension): def __init__(self, name: str, cmake_lists_dir: str = ".", **kwa) -> None: super().__init__(name, sources=[], py_limited_api=True, **kwa) @@ -533,28 +521,6 @@ def get_nvcc_cuda_version() -> Version: return nvcc_cuda_version -def get_gaudi_sw_version(): - """ - Returns the driver version. - """ - # Enable console printing for `hl-smi` check - output = subprocess.run( - "hl-smi", - shell=True, - text=True, - capture_output=True, - env={"ENABLE_CONSOLE": "true"}, - ) - if output.returncode == 0 and output.stdout: - return ( - output.stdout.split("\n")[2] - .replace(" ", "") - .split(":")[1][:-1] - .split("-")[0] - ) - return "0.0.0" # when hl-smi is not available - - def get_vllm_version() -> str: # Allow overriding the version. This is useful to build platform-specific # wheels (e.g. CPU, TPU) without modifying the source. From 5f7209a793ec553889f8ba9972a0034393a6b196 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Sat, 22 Nov 2025 08:00:50 -0500 Subject: [PATCH 028/129] [tiny] Remove unsupported TRITON_MLA backend from batch invariance (#28832) Signed-off-by: Bram Wasti Signed-off-by: Bram Wasti Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- vllm/model_executor/layers/batch_invariant.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index bec7af0286345..8b33727f05fbc 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -805,11 +805,11 @@ def override_envs_for_invariance(): "FLASH_ATTN", # best supported backend "FLASHINFER", "FLASH_ATTN_MLA", - "TRITON_MLA", # Not yet supported MLA backends # "FLASHMLA", # "FLEX_ATTENTION", # IMA issue even if we disable batch invariance # "FLASHINFER_MLA", https://github.com/vllm-project/vllm/pull/28967 + # "TRITON_MLA", ] if curr_attn_backend not in supported_backends: warning = ( From 066209a045216c87bd582be97830eae728a29369 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Sat, 22 Nov 2025 15:38:44 +0100 Subject: [PATCH 029/129] [Attention] Refactor FA `block_size` limitations to hybrid models only (#29084) Signed-off-by: NickLucche --- tests/v1/attention/test_mla_backends.py | 2 +- tests/v1/worker/test_gpu_model_runner.py | 4 ++- vllm/attention/backends/abstract.py | 10 ++++--- vllm/v1/attention/backends/flash_attn.py | 27 ++++++++++++++----- vllm/v1/attention/backends/flashinfer.py | 12 ++++----- vllm/v1/attention/backends/mla/cutlass_mla.py | 5 +++- .../attention/backends/mla/flashattn_mla.py | 5 +++- .../attention/backends/mla/flashinfer_mla.py | 5 +++- vllm/v1/attention/backends/mla/flashmla.py | 5 +++- .../attention/backends/mla/flashmla_sparse.py | 5 +++- vllm/v1/attention/backends/mla/indexer.py | 6 ++--- .../attention/backends/mla/rocm_aiter_mla.py | 4 ++- vllm/v1/attention/backends/rocm_aiter_fa.py | 5 +++- vllm/v1/attention/backends/tree_attn.py | 5 +++- vllm/v1/attention/backends/triton_attn.py | 5 +++- vllm/v1/attention/backends/xformers.py | 5 +++- vllm/v1/worker/gpu_model_runner.py | 4 +-- 17 files changed, 82 insertions(+), 32 deletions(-) diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py index 1bd05e6183dc2..783e02ce89bdb 100644 --- a/tests/v1/attention/test_mla_backends.py +++ b/tests/v1/attention/test_mla_backends.py @@ -61,7 +61,7 @@ for backend in BACKENDS_TO_TEST: BACKEND_BLOCK_SIZES = {} for backend in BACKENDS_TO_TEST: - supported_sizes = backend.get_class().supported_kernel_block_sizes + supported_sizes = backend.get_class().get_supported_kernel_block_sizes() if supported_sizes: default_size = supported_sizes[0] block_size = ( diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 01c1364f7ee62..d0f1b703fcb92 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -185,7 +185,9 @@ def _make_mock_backend_for_kernel_block_size( supported_sizes: list[int | MultipleOf], ): class _MockBackend: - supported_kernel_block_sizes = supported_sizes + @staticmethod + def get_supported_kernel_block_sizes(): + return supported_sizes return _MockBackend() diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 67ded88475243..bd7e81b15bfc3 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -46,9 +46,12 @@ class AttentionBackend(ABC): # makes sure the output tensor is allocated inside the cudagraph. accept_output_buffer: bool = False supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(1)] supported_kv_cache_dtypes: ClassVar[list["CacheDType"]] = ["auto"] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [MultipleOf(1)] + @staticmethod @abstractmethod def get_name() -> str: @@ -142,10 +145,11 @@ class AttentionBackend(ABC): if block_size not in valid_sizes: return False - if not cls.supported_kernel_block_sizes: + supported_kernel_block_sizes = cls.get_supported_kernel_block_sizes() + if not supported_kernel_block_sizes: return True - for supported_size in cls.supported_kernel_block_sizes: + for supported_size in supported_kernel_block_sizes: if isinstance(supported_size, MultipleOf): supported_size = supported_size.base # With hybrid_blocks feature, the framework-level block size diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 9fa6b1dfd19dd..a9a4af5ac1183 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -32,7 +32,7 @@ if is_flash_attn_varlen_func_available(): get_scheduler_metadata, reshape_and_cache_flash, ) -from vllm.config import VllmConfig, get_layers_from_vllm_config +from vllm.config import VllmConfig, get_current_vllm_config, get_layers_from_vllm_config from vllm.config.cache import CacheDType from vllm.distributed.parallel_state import get_dcp_group from vllm.logger import init_logger @@ -56,11 +56,26 @@ logger = init_logger(__name__) class FlashAttentionBackend(AttentionBackend): accept_output_buffer: bool = True supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] - # NOTE(tdoublep): while in principle, FA supports - # MultipleOf(16), these are the block sizes that do not - # suffer from the NaN propagation problem described here: - # https://github.com/Dao-AILab/flash-attention/issues/1974 - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [16, 32, 64] + + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + vllm_config = get_current_vllm_config() + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + if ( + model_config + and model_config.is_hybrid + and ( + cache_config.mamba_ssm_cache_dtype == "float32" + or cache_config.mamba_cache_dtype == "float32" + ) + ): + # NOTE(tdoublep): while in principle, FA supports + # MultipleOf(16), these are the block sizes that do not + # suffer from the NaN propagation problem described here: + # https://github.com/Dao-AILab/flash-attention/issues/1974 + return [16, 32, 64] + return [MultipleOf(16)] @staticmethod def get_name() -> str: diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index e3f499216d7f1..8159f4096107f 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -16,7 +16,6 @@ from flashinfer import ( from flashinfer.decode import _get_range_buf, trtllm_batch_decode_with_kv_cache from flashinfer.prefill import trtllm_batch_context_with_kv_cache from flashinfer.utils import FP4Tensor -from typing_extensions import override from vllm import envs from vllm.attention.backends.abstract import ( @@ -275,10 +274,6 @@ class BatchDCPPrefillWrapper: class FlashInferBackend(AttentionBackend): accept_output_buffer: bool = True supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] - # Note: Not sure for all platforms, - # but on Blackwell, only support a page size of - # 16, 32, 64 - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [16, 32, 64] supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [ "auto", "fp8", @@ -286,6 +281,12 @@ class FlashInferBackend(AttentionBackend): "fp8_e5m2", ] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + # Note: Not sure for all platforms, but on Blackwell, + # only support a page size of 16, 32, 64. + return [16, 32, 64] + @staticmethod def get_name() -> str: return "FLASHINFER" @@ -566,7 +567,6 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): ) @classmethod - @override def get_cudagraph_support( cls: type["FlashInferMetadataBuilder"], vllm_config: VllmConfig, diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index 60cb5022a55eb..5e3fbc0abf083 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -36,13 +36,16 @@ class CutlassMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]): class CutlassMLABackend(MLACommonBackend): supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [128] supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [ "auto", "fp8", "fp8_e4m3", ] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [128] + @staticmethod def get_name() -> str: return "CUTLASS_MLA" diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py index 12639edc8b9a1..d369814c10b6f 100644 --- a/vllm/v1/attention/backends/mla/flashattn_mla.py +++ b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -41,9 +41,12 @@ logger = init_logger(__name__) class FlashAttnMLABackend(MLACommonBackend): supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(16)] supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = ["auto"] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [MultipleOf(16)] + @staticmethod def get_name() -> str: return "FLASH_ATTN_MLA" diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla.py b/vllm/v1/attention/backends/mla/flashinfer_mla.py index 52bb19e039e45..f02a4bb1ef35a 100644 --- a/vllm/v1/attention/backends/mla/flashinfer_mla.py +++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py @@ -35,13 +35,16 @@ class FlashInferMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]): class FlashInferMLABackend(MLACommonBackend): supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [32, 64] supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [ "auto", "fp8", "fp8_e4m3", ] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [32, 64] + @staticmethod def get_name() -> str: return "FLASHINFER_MLA" diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 3aab1f9bb7fb6..74a4cd8430250 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -39,13 +39,16 @@ logger = init_logger(__name__) class FlashMLABackend(MLACommonBackend): supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [64] supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [ "auto", "fp8", "fp8_e4m3", ] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [64] + @staticmethod def get_name() -> str: return "FLASHMLA" diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py index 3f2cc8c38327e..1eee1d225293b 100644 --- a/vllm/v1/attention/backends/mla/flashmla_sparse.py +++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py @@ -55,9 +55,12 @@ structured as: class FlashMLASparseBackend(AttentionBackend): accept_output_buffer: bool = True supported_dtypes: ClassVar[list[torch.dtype]] = [torch.bfloat16] - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [64] supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = ["auto", "fp8_ds_mla"] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [64] + @staticmethod def get_name() -> str: return "FLASHMLA_SPARSE" diff --git a/vllm/v1/attention/backends/mla/indexer.py b/vllm/v1/attention/backends/mla/indexer.py index d38361e0fcbf8..77f1ba00d5b04 100644 --- a/vllm/v1/attention/backends/mla/indexer.py +++ b/vllm/v1/attention/backends/mla/indexer.py @@ -24,9 +24,9 @@ logger = init_logger(__name__) class DeepseekV32IndexerBackend(AttentionBackend): - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [ - 1 if current_platform.is_rocm() else 64 - ] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [1 if current_platform.is_rocm() else 64] @classmethod def get_supported_head_sizes(cls) -> list[int]: diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 6ccc1a341d56c..56f9c7a281e7f 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -21,7 +21,9 @@ from vllm.v1.kv_cache_interface import AttentionSpec class AiterMLABackend(MLACommonBackend): - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [1] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [1] @staticmethod def get_name() -> str: diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index ea611848b0e81..c8742e9835203 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -447,7 +447,10 @@ class AiterFlashAttentionMetadataBuilder( class AiterFlashAttentionBackend(AttentionBackend): accept_output_buffer: bool = True supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(16)] + + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [MultipleOf(16)] @classmethod def get_supported_head_sizes(cls) -> list[int]: diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py index 1bf38ed225a4c..523f759e05a21 100644 --- a/vllm/v1/attention/backends/tree_attn.py +++ b/vllm/v1/attention/backends/tree_attn.py @@ -31,7 +31,10 @@ logger = init_logger(__name__) class TreeAttentionBackend(AttentionBackend): accept_output_buffer: bool = True supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(16)] + + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [MultipleOf(16)] @classmethod def get_supported_head_sizes(cls) -> list[int]: diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 09c36043c8c86..d051a89f03bb4 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -154,7 +154,6 @@ class TritonAttentionBackend(AttentionBackend): torch.bfloat16, torch.float32, ] - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(16)] supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [ "auto", "fp8", @@ -162,6 +161,10 @@ class TritonAttentionBackend(AttentionBackend): "fp8_e5m2", ] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [MultipleOf(16)] + @staticmethod def get_name() -> str: return "TRITON_ATTN" diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py index d15d79417cc61..5039c44b9c3e6 100644 --- a/vllm/v1/attention/backends/xformers.py +++ b/vllm/v1/attention/backends/xformers.py @@ -42,7 +42,10 @@ logger = init_logger(__name__) class XFormersAttentionBackend(AttentionBackend): accept_output_buffer: bool = True supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(16)] + + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [MultipleOf(16)] @classmethod def get_supported_head_sizes(cls) -> list[int]: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e786cd8bc7c97..298bb1ef5f6fd 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4618,7 +4618,7 @@ class GPUModelRunner( """ for backend in backends: is_supported = False - for supported_size in backend.supported_kernel_block_sizes: + for supported_size in backend.get_supported_kernel_block_sizes(): if isinstance(supported_size, int): if block_size == supported_size: is_supported = True @@ -4649,7 +4649,7 @@ class GPUModelRunner( all_int_supported_sizes = set( supported_size for backend in backends - for supported_size in backend.supported_kernel_block_sizes + for supported_size in backend.get_supported_kernel_block_sizes() if isinstance(supported_size, int) ) From d44a63c6d6e1a545aff270b3b85cf231ef779dab Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sat, 22 Nov 2025 06:41:25 -0800 Subject: [PATCH 030/129] [BugFix] Fix returned logprobs with spec decode + prefill chunking (#29216) Signed-off-by: Nick Hill --- tests/v1/sample/test_logprobs.py | 13 +++++++++---- vllm/v1/sample/sampler.py | 5 ++++- vllm/v1/worker/gpu_model_runner.py | 19 +++++++++---------- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 42584938bc06f..c0b0e1ea226ed 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -521,8 +521,8 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode): pytest.param( ( "eagle", - "meta-llama/Llama-3.1-8B-Instruct", - "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", + "meta-llama/Llama-3.2-1B-Instruct", + "nm-testing/Llama3_2_1B_speculator.eagle3", ), marks=large_gpu_mark(min_gb=32), ), @@ -541,7 +541,7 @@ def test_spec_decode_logprobs( """ from vllm import LLM - prompt = "Hello world" + prompt = "Hello world " * 50 sampling_params = SamplingParams( temperature=0, logprobs=3, max_tokens=10, ignore_eos=False ) @@ -582,6 +582,9 @@ def test_spec_decode_logprobs( seed=42, logprobs_mode=logprobs_mode, gpu_memory_utilization=0.4, + # Force prefill chunking + enable_chunked_prefill=True, + max_num_batched_tokens=32, ) spec_results = spec_llm.generate([prompt], sampling_params) # Collect logprobs outputs from spec decode LLM. @@ -597,6 +600,8 @@ def test_spec_decode_logprobs( # Per-token logprobs are expected to be the same. assert len(ref_logprobs) == len(spec_logprobs) for ref_logprob, spec_logprob in zip(ref_logprobs, spec_logprobs): - assert math.isclose(ref_logprob.logprob, spec_logprob.logprob, abs_tol=1e-3) + assert math.isclose( + ref_logprob.logprob, spec_logprob.logprob, rel_tol=5e-2, abs_tol=1e-1 + ) assert ref_logprob.rank == spec_logprob.rank assert ref_logprob.decoded_token == spec_logprob.decoded_token diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 39c63fe31ad2c..c75b4f0543c0d 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -81,7 +81,10 @@ class Sampler(nn.Module): if logprobs_mode == "raw_logprobs": raw_logprobs = self.compute_logprobs(logits) elif logprobs_mode == "raw_logits": - raw_logprobs = logits.clone() + if logits.dtype == torch.float32: + raw_logprobs = logits.clone() + else: + raw_logprobs = logits.to(torch.float32) # Use float32 for the logits. logits = logits.to(torch.float32) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 298bb1ef5f6fd..979f977587038 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2466,7 +2466,9 @@ class GPUModelRunner( num_sampled_tokens = sampler_output.sampled_token_ids.shape[0] sampled_token_ids = sampler_output.sampled_token_ids + logprobs_tensors = sampler_output.logprobs_tensors invalid_req_indices = [] + cu_num_new_tokens: list[int] | None = None if not self.use_async_scheduling: # Get the valid generated tokens. max_gen_len = sampled_token_ids.shape[-1] @@ -2479,6 +2481,12 @@ class GPUModelRunner( sampled_token_ids, self.input_batch.vocab_size, ) + if logprobs_tensors: + # Needed for extracting logprobs when spec decoding. + # This must be done prior to discarding sampled tokens. + cu_num_new_tokens = [0] + for toks in valid_sampled_token_ids: + cu_num_new_tokens.append(cu_num_new_tokens[-1] + len(toks)) # Mask out the sampled tokens that should not be sampled. for i in discard_sampled_tokens_req_indices: valid_sampled_token_ids[int(i)].clear() @@ -2506,10 +2514,6 @@ class GPUModelRunner( # the sampled tokens back, because there's no direct communication # between the first-stage worker and the last-stage worker. req_ids = self.input_batch.req_ids - logprobs_tensors = sampler_output.logprobs_tensors - cu_num_accepted_tokens = ( - [0] if spec_decode_metadata and logprobs_tensors else None - ) for req_idx in range(num_sampled_tokens): if self.use_async_scheduling: sampled_ids = [-1] if req_idx not in invalid_req_indices_set else None @@ -2518,11 +2522,6 @@ class GPUModelRunner( num_sampled_ids: int = len(sampled_ids) if sampled_ids else 0 - if cu_num_accepted_tokens is not None: - cu_num_accepted_tokens.append( - cu_num_accepted_tokens[-1] + num_sampled_ids - ) - if not sampled_ids: continue @@ -2544,7 +2543,7 @@ class GPUModelRunner( req_state.output_token_ids.extend(sampled_ids) logprobs_lists = ( - logprobs_tensors.tolists(cu_num_accepted_tokens) + logprobs_tensors.tolists(cu_num_new_tokens) if not self.use_async_scheduling and logprobs_tensors is not None else None ) From ae66818379fc2403e43c47154a98170aa7cab192 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 22 Nov 2025 22:48:01 +0800 Subject: [PATCH 031/129] [Misc] Fix pre-commit (#29238) Signed-off-by: DarkLight1337 --- vllm/model_executor/model_loader/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 1db6337f4c9f9..2021b68b8a60b 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -19,7 +19,6 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, ) - from vllm.model_executor.models.interfaces import SupportsQuant, supports_multimodal from vllm.utils.platform_utils import is_pin_memory_available @@ -173,6 +172,7 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], as_seq_cls_model, try_create_mm_pooling_model_cls, ) + architectures = getattr(model_config.hf_config, "architectures", []) model_cls, arch = model_config.registry.resolve_model_cls( From d84d8f4429a5246a9d9f179b47fac7e13801710d Mon Sep 17 00:00:00 2001 From: ZiTian Zhao Date: Sat, 22 Nov 2025 22:48:59 +0800 Subject: [PATCH 032/129] Fix EVS crash when using `video_embeds` inputs in Qwen2.5-VL (#29232) Signed-off-by: zitian.zhao Co-authored-by: Cyrus Leung --- vllm/model_executor/models/qwen2_5_vl.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 8e3c0e84dfe51..1500a437613cc 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -230,6 +230,9 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema): - hidden_size must match the hidden size of language model backbone. - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w) format + - second_per_grid_ts: The video time interval (in seconds) for each + grid along the temporal dimension in the 3D position IDs. Returned + when `videos` is not `None`. """ type: Literal["video_embeds"] @@ -244,6 +247,11 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema): TensorShape("nv", 3), ] + second_per_grid_ts: Annotated[ + torch.Tensor | None, + TensorShape("nv"), + ] = None + Qwen2_5_VLVideoInputs: TypeAlias = ( Qwen2_5_VLVideoPixelInputs | Qwen2_5_VLVideoEmbeddingInputs @@ -1311,6 +1319,7 @@ class Qwen2_5_VLForConditionalGeneration( type="video_embeds", video_embeds=video_embeds, video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, ) def _process_image_input( @@ -1422,7 +1431,13 @@ class Qwen2_5_VLForConditionalGeneration( # Cast to long to match the original code # https://github.com/huggingface/transformers/blob/41980ce93e775f6c88500c51c8db7946fc6a2add/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py#L491 # noqa - second_per_grid_ts = video_input["second_per_grid_ts"].long() + second_per_grid_ts = video_input.get("second_per_grid_ts") + if second_per_grid_ts is None: + raise ValueError( + "second_per_grid_ts is required when video_pruning_rate > 0 " + "is enabled for video inputs, including the video_embeds path." + ) + second_per_grid_ts = second_per_grid_ts.long() tokens_per_second = self.config.vision_config.tokens_per_second video_embeds_out = [] From f55c76c2b3270bb45072c05d6d53460c373b2172 Mon Sep 17 00:00:00 2001 From: Federico <65908512+coval3nte@users.noreply.github.com> Date: Sat, 22 Nov 2025 17:42:48 +0100 Subject: [PATCH 033/129] chore: add RTX_PRO_6000 GLM4.6-FP8 kernel tuning (#29240) --- ...ackwell_Server_Edition,dtype=fp8_w8a8.json | 147 ++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..8b78f87e7f73b --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.0", + "1": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} From 730bd35378bf2a5b56b6d3a45be28b3092d26519 Mon Sep 17 00:00:00 2001 From: Fadi Arafeh <115173828+fadara01@users.noreply.github.com> Date: Sat, 22 Nov 2025 17:04:36 +0000 Subject: [PATCH 034/129] [perf][cpu] Accelerate paged attention GEMMs (QK, PV) on Arm CPUs with NEON (#29193) Signed-off-by: Fadi Arafeh --- csrc/cpu/cpu_attn.cpp | 17 ++ csrc/cpu/cpu_attn_impl.hpp | 8 +- csrc/cpu/cpu_attn_neon.hpp | 386 +++++++++++++++++++++++++ vllm/engine/arg_utils.py | 3 +- vllm/v1/attention/backends/cpu_attn.py | 7 +- 5 files changed, 416 insertions(+), 5 deletions(-) create mode 100644 csrc/cpu/cpu_attn_neon.hpp diff --git a/csrc/cpu/cpu_attn.cpp b/csrc/cpu/cpu_attn.cpp index 50f17c758c148..92f8bee5a47a0 100644 --- a/csrc/cpu/cpu_attn.cpp +++ b/csrc/cpu/cpu_attn.cpp @@ -13,6 +13,18 @@ #define AMX_DISPATCH(...) case cpu_attention::ISA::AMX: #endif +#ifdef __aarch64__ + #include "cpu_attn_neon.hpp" + #define NEON_DISPATCH(...) \ + case cpu_attention::ISA::NEON: { \ + using attn_impl = cpu_attention::AttentionImpl; \ + return __VA_ARGS__(); \ + } +#else + #define NEON_DISPATCH(...) case cpu_attention::ISA::NEON: +#endif // #ifdef __aarch64__ + #define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...) \ case HEAD_DIM: { \ constexpr size_t head_dim = HEAD_DIM; \ @@ -41,6 +53,7 @@ [&] { \ switch (ISA_TYPE) { \ AMX_DISPATCH(__VA_ARGS__) \ + NEON_DISPATCH(__VA_ARGS__) \ case cpu_attention::ISA::VEC: { \ using attn_impl = \ cpu_attention::AttentionImpl class AttentionImpl {}; @@ -143,6 +143,12 @@ struct AttentionMetadata { case ISA::VEC: ss << "VEC, "; break; + case ISA::VEC16: + ss << "VEC16, "; + break; + case ISA::NEON: + ss << "NEON, "; + break; } ss << "workitem_group_num: " << workitem_group_num << ", reduction_item_num: " << reduction_item_num diff --git a/csrc/cpu/cpu_attn_neon.hpp b/csrc/cpu/cpu_attn_neon.hpp new file mode 100644 index 0000000000000..827f0cfbc718e --- /dev/null +++ b/csrc/cpu/cpu_attn_neon.hpp @@ -0,0 +1,386 @@ +#ifndef CPU_ATTN_NEON_HPP +#define CPU_ATTN_NEON_HPP + +#include "cpu_attn_impl.hpp" +#include +#include +namespace cpu_attention { + +namespace { + +#define BLOCK_SIZE_ALIGNMENT 32 +#define HEAD_SIZE_ALIGNMENT 32 +#define MAX_Q_HEAD_NUM_PER_ITER 16 + +// These do not use vectorized class for loading / converting +// because csrc/cpu/cpu_types_arm.hpp does not have fallback options +// for vec_op::BF16Vec* / vec_op::BF16Vec* on Arm HW that +// doesn't support BF16. +// We don't use vec_op::FP32Vec* or vec_op::FP16Vec* for consistency. +template +FORCE_INLINE void load_row8_B_as_f32(const kv_cache_t* p, float32x4_t& b0, + float32x4_t& b1); + +template <> +FORCE_INLINE void load_row8_B_as_f32(const float* p, float32x4_t& b0, + float32x4_t& b1) { + b0 = vld1q_f32(p + 0); + b1 = vld1q_f32(p + 4); +} + +template <> +FORCE_INLINE void load_row8_B_as_f32(const c10::Half* p, + float32x4_t& b0, + float32x4_t& b1) { + const float16_t* h = reinterpret_cast(p); + float16x8_t v = vld1q_f16(h); + b0 = vcvt_f32_f16(vget_low_f16(v)); + b1 = vcvt_f32_f16(vget_high_f16(v)); +} + +template <> +FORCE_INLINE void load_row8_B_as_f32(const c10::BFloat16* p, + float32x4_t& b0, + float32x4_t& b1) { + const uint16_t* u = reinterpret_cast(p); +#ifdef ARM_BF16_SUPPORT + uint16x8_t u0 = vld1q_u16(u); + bfloat16x8_t bf0 = vreinterpretq_bf16_u16(u0); + b0 = vcvtq_low_f32_bf16(bf0); + b1 = vcvtq_high_f32_bf16(bf0); +#else + uint16x8_t x0 = vld1q_u16(u); + uint32x4_t lo = vshlq_n_u32(vmovl_u16(vget_low_u16(x0)), 16); + uint32x4_t hi = vshlq_n_u32(vmovl_u16(vget_high_u16(x0)), 16); + b0 = vreinterpretq_f32_u32(lo); + b1 = vreinterpretq_f32_u32(hi); +#endif +} + +// Mx8, with 1 <= M <= 8 , K streamed, unroll-by-4 with NEON FMLAs +// #Loads = (K // 4) * (M + 4 * sizeof(kv_cache_t) / 2) +// #FMLAs = (K // 4) * (4 * 2 * M) +// We have (4 * 2 * M) FMLAs for (M + 4 * sizeof(kv_cache_t) / 2) loads +template +FORCE_INLINE void gemm_micro_neon_fmla_Mx8_Ku4( + const float* __restrict A, // [M x K], + const kv_cache_t* __restrict B, // [K x 8], + float* __restrict C, // [M x 8], + int64_t lda, int64_t ldb, int64_t ldc, int32_t K, bool accumulate) { + // kernel supports max M of 8, as it'd spill for larger M + static_assert(1 <= M && M <= 8, "M must be in [1,8]"); + +// helpers for per-M codegen +#define ROWS_APPLY(OP) OP(0) OP(1) OP(2) OP(3) OP(4) OP(5) OP(6) OP(7) +#define IF_M(i) if constexpr (M > (i)) + + // A row base pointers +#define DECL_A(i) const float* a##i = A + (i) * lda; + ROWS_APPLY(DECL_A) +#undef DECL_A + + // declare 2 accumulators per row of M +#define DECL_ACC(i) float32x4_t acc##i##_0, acc##i##_1; + ROWS_APPLY(DECL_ACC) +#undef DECL_ACC + + // initialize accumulators +#define INIT_ACC(i) \ + IF_M(i) { \ + if (accumulate) { \ + acc##i##_0 = vld1q_f32(C + (i) * ldc + 0); \ + acc##i##_1 = vld1q_f32(C + (i) * ldc + 4); \ + } else { \ + acc##i##_0 = vdupq_n_f32(0.f); \ + acc##i##_1 = vdupq_n_f32(0.f); \ + } \ + } + ROWS_APPLY(INIT_ACC) +#undef INIT_ACC + + int32_t k = 0; + + // K unrolled by 4 + for (; k + 3 < K; k += 4) { + // load A[k..k+3] for each active row (M) +#define LOAD_A4(i) \ + float32x4_t a##i##v; \ + IF_M(i) a##i##v = vld1q_f32(a##i + k); + ROWS_APPLY(LOAD_A4) +#undef LOAD_A4 + + // helper: FMA lane L from aiv +#define FMAS_LANE(i, aiv, L) \ + IF_M(i) { \ + acc##i##_0 = vfmaq_laneq_f32(acc##i##_0, b0, aiv, L); \ + acc##i##_1 = vfmaq_laneq_f32(acc##i##_1, b1, aiv, L); \ + } + + // k + 0 + { + float32x4_t b0, b1; + load_row8_B_as_f32(B + (int64_t)(k + 0) * ldb, b0, b1); +#define STEP_K0(i) FMAS_LANE(i, a##i##v, 0) + ROWS_APPLY(STEP_K0) +#undef STEP_K0 + } + // k + 1 + { + float32x4_t b0, b1; + load_row8_B_as_f32(B + (int64_t)(k + 1) * ldb, b0, b1); +#define STEP_K1(i) FMAS_LANE(i, a##i##v, 1) + ROWS_APPLY(STEP_K1) +#undef STEP_K1 + } + // k + 2 + { + float32x4_t b0, b1; + load_row8_B_as_f32(B + (int64_t)(k + 2) * ldb, b0, b1); +#define STEP_K2(i) FMAS_LANE(i, a##i##v, 2) + ROWS_APPLY(STEP_K2) +#undef STEP_K2 + } + // k + 3 + { + float32x4_t b0, b1; + load_row8_B_as_f32(B + (int64_t)(k + 3) * ldb, b0, b1); +#define STEP_K3(i) FMAS_LANE(i, a##i##v, 3) + ROWS_APPLY(STEP_K3) +#undef STEP_K3 + } +#undef FMAS_LANE + } + + // K tail + for (; k < K; ++k) { + float32x4_t b0, b1; + load_row8_B_as_f32(B + (int64_t)k * ldb, b0, b1); +#define TAIL_ROW(i) \ + IF_M(i) { \ + float32x4_t ai = vdupq_n_f32(*(a##i + k)); \ + acc##i##_0 = vfmaq_f32(acc##i##_0, b0, ai); \ + acc##i##_1 = vfmaq_f32(acc##i##_1, b1, ai); \ + } + ROWS_APPLY(TAIL_ROW) +#undef TAIL_ROW + } + + // store accumulators to C +#define STORE_ROW(i) \ + IF_M(i) { \ + vst1q_f32(C + (i) * ldc + 0, acc##i##_0); \ + vst1q_f32(C + (i) * ldc + 4, acc##i##_1); \ + } + ROWS_APPLY(STORE_ROW) +#undef STORE_ROW + +#undef ROWS_APPLY +#undef IF_M +} + +template +FORCE_INLINE void gemm_macro_neon_fmla_Mx8_Ku4(const float* __restrict A, + const kv_cache_t* __restrict B, + float* __restrict C, int32_t M, + int32_t K, int64_t lda, + int64_t ldb, int64_t ldc, + bool accumulate) { + // micro kernel is Mx8 + static_assert(N % 8 == 0, "N must be a multiple of 8"); + for (int32_t m = 0; m < M;) { + int32_t mb = (M - m >= 8) ? 8 : (M - m >= 4) ? 4 : (M - m >= 2) ? 2 : 1; + const float* Ab = A + m * lda; + float* Cb = C + m * ldc; + + for (int32_t n = 0; n < N; n += 8) { + const kv_cache_t* Bn = B + n; + float* Cn = Cb + n; + switch (mb) { + case 8: + gemm_micro_neon_fmla_Mx8_Ku4<8, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc, + K, accumulate); + break; + case 4: + gemm_micro_neon_fmla_Mx8_Ku4<4, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc, + K, accumulate); + break; + case 2: + gemm_micro_neon_fmla_Mx8_Ku4<2, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc, + K, accumulate); + break; + default: + gemm_micro_neon_fmla_Mx8_Ku4<1, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc, + K, accumulate); + break; + } + } + // no tail loop for N as it's guaranteed to be a multiple of 8 + m += mb; + } +} + +template +class TileGemmNeonFMLA { + public: + template + FORCE_INLINE static void gemm(const int32_t m_size, + float* __restrict__ a_tile, + kv_cache_t* __restrict__ b_tile, + float* __restrict__ c_tile, const int64_t lda, + const int64_t ldb, const int64_t ldc, + const int32_t block_size, + const int32_t dynamic_k_size, + const bool accum_c) { + if constexpr (phase == AttentionGemmPhase::QK) { + gemm_macro_neon_fmla_Mx8_Ku4( + a_tile, b_tile, c_tile, m_size, k_size, lda, ldb, ldc, accum_c); + } else { + gemm_macro_neon_fmla_Mx8_Ku4( + a_tile, b_tile, c_tile, m_size, dynamic_k_size, lda, ldb, ldc, + accum_c); + } + } +}; + +} // namespace + +// this is similar to "ISA::VEC" at the moment +template +class AttentionImpl { + public: + using query_t = scalar_t; + using q_buffer_t = float; + using kv_cache_t = scalar_t; + using logits_buffer_t = float; + using partial_output_buffer_t = float; + using prob_buffer_t = float; + + constexpr static int64_t BlockSizeAlignment = + BLOCK_SIZE_ALIGNMENT; // KV token num unit of QK and PV phases + constexpr static int64_t HeadDimAlignment = + HEAD_SIZE_ALIGNMENT; // headdim num unit of PV phase + constexpr static int64_t MaxQHeadNumPerIteration = MAX_Q_HEAD_NUM_PER_ITER; + constexpr static int64_t HeadDim = head_dim; + constexpr static ISA ISAType = ISA::NEON; + constexpr static bool scale_on_logits = false; // apply scale on q_buffer + + static_assert(HeadDim % HeadDimAlignment == 0); + // the gemm micro kernel is Mx8 + static_assert(HeadDimAlignment % 8 == 0); + static_assert(BlockSizeAlignment % 8 == 0); + + public: + template