From c3e0e9337ef0af04d2d18b263a6a0f7deed75856 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 31 Jul 2025 18:26:11 -0400 Subject: [PATCH 01/54] [Feature] Add Flashinfer MoE Support for Compressed Tensor NVFP4 (#21639) Signed-off-by: yewentao256 --- .../compressed_tensors_moe.py | 53 +++++- .../layers/quantization/modelopt.py | 150 +++-------------- .../quantization/utils/flashinfer_fp4_moe.py | 154 ++++++++++++++++++ .../quantization/utils/nvfp4_moe_support.py | 59 +++++++ 4 files changed, 287 insertions(+), 129 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py create mode 100644 vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 17b41e8a1c23c..09d8890888fa8 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -17,9 +17,14 @@ from vllm.model_executor.layers.fused_moe import ( FusedMoE, FusedMoEActivationFormat, FusedMoEConfig, FusedMoEMethodBase, FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize, FusedMoeWeightScaleSupported) +from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa + FlashInferCutlassMoEPrepareAndFinalize) from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import ( # noqa WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP) from vllm.model_executor.layers.quantization.utils import replace_parameter +from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( + build_flashinfer_fp4_cutlass_moe_kernel, + flashinfer_fp4_cutlass_moe_forward, reorder_w1w3_to_w3w1) from 
vllm.model_executor.layers.quantization.utils.marlin_utils import ( check_moe_marlin_supports_layer, marlin_make_workspace_new, marlin_moe_permute_scales) @@ -28,7 +33,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( prepare_moe_fp8_layer_for_marlin) from vllm.model_executor.layers.quantization.utils.quant_utils import ( - cutlass_fp4_supported, swizzle_blockscale) + swizzle_blockscale) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize) from vllm.model_executor.utils import set_weight_attrs @@ -96,8 +101,14 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): def __init__(self): - self.use_marlin = not cutlass_fp4_supported() + from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501 + detect_nvfp4_moe_support) + _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__) + self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported + self.allow_flashinfer_cutlass = _nvfp4.allow_flashinfer_cutlass + self.use_marlin = _nvfp4.use_marlin self.group_size = 16 + self.fused_experts = None # type: ignore[assignment] def create_weights(self, layer: torch.nn.Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -200,6 +211,14 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): layer.w2_weight = torch.nn.Parameter(layer.w2_weight_packed.data, requires_grad=False) + # reorder GEMM1 weights and block scales for FlashInfer CUTLASS kernel. 
+ if self.allow_flashinfer_cutlass: + w, s = reorder_w1w3_to_w3w1(layer.w13_weight.data, + layer.w13_weight_scale.data, + dim=-2) + layer.w13_weight = torch.nn.Parameter(w, requires_grad=False) + layer.w13_weight_scale = torch.nn.Parameter(s, requires_grad=False) + if not torch.allclose(layer.w13_weight_global_scale[:, 0], layer.w13_weight_global_scale[:, 1]): logger.warning_once( @@ -246,6 +265,21 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): layer.w2_input_scale_quant = torch.nn.Parameter( (layer.w2_input_global_scale), requires_grad=False) + def maybe_swap_experts_impl(self, moe_parallel_config): + if not self.allow_flashinfer_cutlass: + return + self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel( + moe_parallel_config) + + def select_gemm_impl(self, prepare_finalize, moe): + """Return the appropriate GEMM experts implementation.""" + assert moe is not None and prepare_finalize is not None + from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( # noqa: E501 + select_nvfp4_gemm_impl) + + return select_nvfp4_gemm_impl(self.allow_flashinfer_cutlass, moe, + logger) + def apply( self, layer: torch.nn.Module, @@ -303,10 +337,23 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): global_num_experts=global_num_experts, expert_map=expert_map) + # FlashInfer fused experts path + if self.fused_experts is not None: + return flashinfer_fp4_cutlass_moe_forward( + self.fused_experts, + layer, + x, + topk_weights, + topk_ids, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + assert expert_map is None, ("Expert Parallelism / expert_map " "is currently not supported for " "CompressedTensorsW4A4MoeMethod.") - from vllm.model_executor.layers.fused_moe.cutlass_moe import ( cutlass_moe_fp4) diff --git a/vllm/model_executor/layers/quantization/modelopt.py 
b/vllm/model_executor/layers/quantization/modelopt.py index b8ffcf90c022b..0334a2824512d 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -10,11 +10,8 @@ from torch.nn.parameter import Parameter import vllm.envs as envs import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant -from vllm.distributed import get_ep_group from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig -from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 - FlashInferCutlassMoEPrepareAndFinalize) from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, @@ -23,6 +20,9 @@ from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod +from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( + build_flashinfer_fp4_cutlass_moe_kernel, + flashinfer_fp4_cutlass_moe_forward, reorder_w1w3_to_w3w1) from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( apply_flashinfer_per_tensor_scale_fp8, rotate_flashinfer_fp8_moe_weights, swap_w13_to_w31) @@ -35,7 +35,6 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( Fp8LinearOp, requantize_with_max_scale) from vllm.model_executor.parameter import (ModelWeightParameter, PerTensorScaleParameter) -from vllm.platforms import current_platform from vllm.scalar_type import scalar_types from vllm.utils.flashinfer import has_flashinfer_moe @@ -869,28 +868,12 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): def 
__init__(self, quant_config: ModelOptNvFp4Config): self.quant_config = quant_config - self.cutlass_nvfp4_supported = cutlass_fp4_supported() - self.use_marlin = False - self.allow_flashinfer_cutlass = False - - if envs.VLLM_USE_FLASHINFER_MOE_FP4: - if self.cutlass_nvfp4_supported and current_platform.is_cuda() \ - and current_platform.is_device_capability(100): - logger.info_once( - "Using FlashInfer kernels for ModelOptNvFp4FusedMoE.") - self.allow_flashinfer_cutlass = True - else: - logger.warning_once( - "Flashinfer CUTLASS Fused MoE not supported " - "or found on the current platform.") - - if not self.cutlass_nvfp4_supported: - if is_fp4_marlin_supported(): - self.use_marlin = True - else: - raise ValueError("Current platform does not support NVFP4" - " quantization. Please use Blackwell and" - " above.") + from vllm.model_executor.layers.quantization.utils.nvfp4_moe_support import ( # noqa: E501 + detect_nvfp4_moe_support) + _nvfp4 = detect_nvfp4_moe_support(self.__class__.__name__) + self.cutlass_nvfp4_supported = _nvfp4.cutlass_supported + self.allow_flashinfer_cutlass = _nvfp4.allow_flashinfer_cutlass + self.use_marlin = _nvfp4.use_marlin self.fused_experts = None # type: ignore @@ -900,29 +883,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): ): if not self.allow_flashinfer_cutlass: return - - logger.debug_once("FlashInferExperts") - # default to TP/EP case only - - experts_kwargs: dict[str, Any] = { - "use_nvfp4_w4a4": True, - "use_dp": moe_parallel_config.dp_size > 1, - "ep_rank": moe_parallel_config.ep_rank, - "ep_size": moe_parallel_config.ep_size, - "tp_rank": moe_parallel_config.tp_rank, - "tp_size": moe_parallel_config.tp_size, - } - - from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 - FlashInferExperts) - experts = FlashInferExperts(**experts_kwargs) - self.fused_experts = mk.FusedMoEModularKernel( - FlashInferCutlassMoEPrepareAndFinalize( - quant_dtype=torch.uint8, - #meaning 2x e2m1 packed in one, 
kernel requirement - ), - experts, - ) + self.fused_experts = build_flashinfer_fp4_cutlass_moe_kernel( + moe_parallel_config) # This method update self.fused_experts # only prepare_finalize is not None call select_gemm_impl @@ -931,32 +893,12 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): def select_gemm_impl(self, prepare_finalize, moe) -> mk.FusedMoEPermuteExpertsUnpermute: - assert moe is not None - assert prepare_finalize is not None - experts = None - all2all_manager = get_ep_group().device_communicator.all2all_manager - assert all2all_manager is not None - if self.allow_flashinfer_cutlass: - from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 - FlashInferExperts) - logger.debug_once("Using FlashInferExperts") - experts = FlashInferExperts( - use_nvfp4_w4a4=True, - use_dp=moe.moe_parallel_config.dp_size > 1, - ep_rank=moe.moe_parallel_config.ep_rank, - ep_size=moe.moe_parallel_config.ep_size, - tp_rank=moe.moe_parallel_config.tp_rank, - tp_size=moe.moe_parallel_config.tp_size, - ) - else: - assert moe.dp_size > 1 - logger.debug_once("Using CutlassExpertsFp4") - # Currently CutlassExpertsFp4 doesn't support DP - raise ValueError("CutlassExpertsFp4 doesn't support DP. 
" - "Use flashinfer CUTLASS FusedMoE backend instead " - "(set VLLM_USE_FLASHINFER_MOE_FP4=1)") + assert moe is not None and prepare_finalize is not None + from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( # noqa: E501 + select_nvfp4_gemm_impl) - return experts + return select_nvfp4_gemm_impl(self.allow_flashinfer_cutlass, moe, + logger) def uses_weight_scale_2_pattern(self) -> bool: """ @@ -1062,18 +1004,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): gemm1_weight_scale = layer.w13_weight_scale.data if self.allow_flashinfer_cutlass: - dim = -2 - size = gemm1_weight.size(dim) - assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}" - half = size // 2 - - # Reorder weight - w1, w3 = gemm1_weight.split(half, dim=dim) - gemm1_weight = torch.cat([w3, w1], dim=dim).contiguous() - - # Reorder scale - s1, s3 = gemm1_weight_scale.split(half, dim=dim) - gemm1_weight_scale = torch.cat([s3, s1], dim=dim).contiguous() + gemm1_weight, gemm1_weight_scale = reorder_w1w3_to_w3w1( + gemm1_weight, gemm1_weight_scale, dim=-2) layer.w13_weight = Parameter(gemm1_weight, requires_grad=False) layer.w13_weight_scale = Parameter(gemm1_weight_scale, @@ -1217,49 +1149,15 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): expert_map=expert_map, apply_router_weight_on_input=apply_router_weight_on_input) else: - # TP or DP case - from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 - is_valid_flashinfer_cutlass_fused_moe) - assert is_valid_flashinfer_cutlass_fused_moe( - x, layer.w13_weight, layer.w2_weight), ( - "Flashinfer CUTLASS Fused MoE not applicable!") - - a1_gscale = layer.w13_input_scale_quant - a2_gscale = layer.w2_input_scale_quant - extra_expert_args = { - 'g1_alphas': layer.g1_alphas, - 'g2_alphas': layer.g2_alphas, - 'out_dtype': x.dtype, - # Avoid confusion with a1_scale and a2_scale - # where are batch size related. 
- 'a1_gscale': a1_gscale, - 'a2_gscale': a2_gscale, - } - extra_prepare_args = { - 'use_dp': layer.dp_size > 1, - 'local_tokens': x.shape[0], - 'a1_gscale': a1_gscale, - } - extra_finalize_args = { - 'use_dp': layer.dp_size > 1, - 'local_tokens': x.shape[0], - } - - out = self.fused_experts( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=False, # TODO(shuw): fix later, now output is high prec + out = flashinfer_fp4_cutlass_moe_forward( + self.fused_experts, + layer, + x, + topk_weights, + topk_ids, activation=activation, global_num_experts=global_num_experts, expert_map=expert_map, - w1_scale=layer.w13_blockscale_swizzled, - w2_scale=layer.w2_blockscale_swizzled, apply_router_weight_on_input=apply_router_weight_on_input, - extra_expert_args=extra_expert_args, - extra_prepare_args=extra_prepare_args, - extra_finalize_args=extra_finalize_args, ) return out diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py new file mode 100644 index 0000000000000..4c617e226041f --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -0,0 +1,154 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Utility helpers for NVFP4 + FlashInfer fused-MoE path""" +from __future__ import annotations + +from typing import Optional + +import torch + +import vllm.envs as envs +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig +from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( + FlashInferExperts, is_valid_flashinfer_cutlass_fused_moe) +from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 + FlashInferCutlassMoEPrepareAndFinalize) 
+from vllm.platforms import current_platform + +logger = init_logger(__name__) + +__all__ = [ + "is_flashinfer_fp4_cutlass_moe_available", + "reorder_w1w3_to_w3w1", + "build_flashinfer_fp4_cutlass_moe_kernel", + "flashinfer_fp4_cutlass_moe_forward", +] + + +def is_flashinfer_fp4_cutlass_moe_available() -> bool: + """Return ``True`` when FlashInfer CUTLASS NV-FP4 kernels can be used.""" + return (envs.VLLM_USE_FLASHINFER_MOE_FP4 and current_platform.is_cuda() + and current_platform.is_device_capability(100)) + + +def reorder_w1w3_to_w3w1(weight: torch.Tensor, + scale: torch.Tensor, + dim: int = -2) -> tuple[torch.Tensor, torch.Tensor]: + """Re-order the concatenated `[w1, w3]` tensors to `[w3, w1]`""" + size = weight.size(dim) + assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}" + half = size // 2 + + w1, w3 = weight.split(half, dim=dim) + s1, s3 = scale.split(half, dim=dim) + + return (torch.cat([w3, w1], + dim=dim).contiguous(), torch.cat([s3, s1], + dim=dim).contiguous()) + + +def build_flashinfer_fp4_cutlass_moe_kernel( + moe_parallel_config: FusedMoEParallelConfig, ) -> mk.FusedMoEModularKernel: + """Create *and return* a FlashInfer CUTLASS fused-MoE modular kernel""" + experts = FlashInferExperts( + use_nvfp4_w4a4=True, + use_dp=moe_parallel_config.dp_size > 1, + ep_rank=moe_parallel_config.ep_rank, + ep_size=moe_parallel_config.ep_size, + tp_rank=moe_parallel_config.tp_rank, + tp_size=moe_parallel_config.tp_size, + ) + logger.debug_once("FlashInferExperts (util)") + return mk.FusedMoEModularKernel( + FlashInferCutlassMoEPrepareAndFinalize(quant_dtype=torch.uint8), + experts, + ) + + +def flashinfer_fp4_cutlass_moe_forward( + fused_experts: mk.FusedMoEModularKernel, + layer: torch.nn.Module, + x: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, +) -> torch.Tensor: + """Common forward wrapper for 
FlashInfer NV-FP4 fused-MoE""" + + assert is_valid_flashinfer_cutlass_fused_moe( + x, layer.w13_weight, + layer.w2_weight), ("FlashInfer CUTLASS fused-MoE not applicable!") + + a1_gscale = layer.w13_input_scale_quant + a2_gscale = layer.w2_input_scale_quant + + extra_expert_args = { + "g1_alphas": layer.g1_alphas, + "g2_alphas": layer.g2_alphas, + # Avoid confusion with a1_scale and a2_scale + # where are batch size related. + "a1_gscale": a1_gscale, + "a2_gscale": a2_gscale, + "out_dtype": x.dtype, + } + extra_prepare_args = { + "use_dp": layer.dp_size > 1, + "local_tokens": x.shape[0], + "a1_gscale": a1_gscale, + } + extra_finalize_args = { + "use_dp": layer.dp_size > 1, + "local_tokens": x.shape[0], + } + + return fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=False, # TODO(shuw): fix later, now output is high prec + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=layer.w13_blockscale_swizzled, + w2_scale=layer.w2_blockscale_swizzled, + apply_router_weight_on_input=apply_router_weight_on_input, + extra_expert_args=extra_expert_args, + extra_prepare_args=extra_prepare_args, + extra_finalize_args=extra_finalize_args, + ) + + +def select_nvfp4_gemm_impl( + allow_flashinfer_cutlass: bool, + moe, # FusedMoEConfig + logger): + """Return a GEMM *experts* implementation for NV-FP4 fused-MoE layers""" + + # lazy import + from vllm.distributed import get_ep_group + + all2all_manager = get_ep_group().device_communicator.all2all_manager + assert all2all_manager is not None + + if allow_flashinfer_cutlass: + logger.debug_once("Using FlashInferExperts") + return FlashInferExperts( + use_nvfp4_w4a4=True, + use_dp=moe.moe_parallel_config.dp_size > 1, + ep_rank=moe.moe_parallel_config.ep_rank, + ep_size=moe.moe_parallel_config.ep_size, + tp_rank=moe.moe_parallel_config.tp_rank, + tp_size=moe.moe_parallel_config.tp_size, + ) + + # native 
cutlass experts currently don't support DP; TP case won't call this + raise ValueError( + "CutlassExpertsFp4 doesn't support DP. Use flashinfer CUTLASS " + "Fused MoE backend instead (set VLLM_USE_FLASHINFER_MOE_FP4=1)") diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py new file mode 100644 index 0000000000000..23a749467f193 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/nvfp4_moe_support.py @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass + +import vllm.envs as envs +from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( + is_flashinfer_fp4_cutlass_moe_available) +from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( + is_fp4_marlin_supported) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + cutlass_fp4_supported) + +__all__ = ["detect_nvfp4_moe_support", "NvFp4Support"] + +_logger = init_logger(__name__) + + +@dataclass(frozen=True) +class NvFp4Support: + """Result container for NV-FP4 capability probing.""" + + cutlass_supported: bool + allow_flashinfer_cutlass: bool + use_marlin: bool + + +def detect_nvfp4_moe_support(class_name: str = "") -> NvFp4Support: + """Detect platform support for NV-FP4 fused-MoE path""" + cutlass_supported = cutlass_fp4_supported() + + allow_flashinfer = (cutlass_supported + and is_flashinfer_fp4_cutlass_moe_available()) + + if allow_flashinfer: + _logger.info_once("Using FlashInfer kernels for %s.", class_name + or "NVFP4 path") + else: + if envs.VLLM_USE_FLASHINFER_MOE_FP4: + _logger.warning_once( + "FlashInfer kernels unavailable for %s on current platform.", + class_name or "NVFP4 path", + ) + + use_marlin = False + if not cutlass_supported: + if is_fp4_marlin_supported(): + use_marlin = True + 
_logger.info_once("Falling back to Marlin FP4 MoE kernel.") + else: + raise ValueError( + "Current platform does not support NVFP4 quantization. " + "Please use Blackwell GPUs or enable FlashInfer.") + + return NvFp4Support( + cutlass_supported=cutlass_supported, + allow_flashinfer_cutlass=allow_flashinfer, + use_marlin=use_marlin, + ) From e360316ab9902ecfc564710ae4b1539db867efd9 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 31 Jul 2025 21:01:55 -0400 Subject: [PATCH 02/54] Add DeepGEMM to Dockerfile in vllm-base image (#21533) Signed-off-by: Matthew Bonanni Signed-off-by: mgoin Co-authored-by: mgoin --- docker/Dockerfile | 30 +++++++++++++++++-- tests/kernels/moe/test_deepep_deepgemm_moe.py | 5 ++-- tests/kernels/moe/test_deepgemm.py | 6 ++-- vllm/utils/deep_gemm.py | 12 ++++++++ 4 files changed, 46 insertions(+), 7 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 69aeee67a4300..413151b3edb00 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,3 @@ - # The vLLM Dockerfile is used to construct vLLM image that can be directly used # to run the OpenAI compatible server. 
@@ -16,6 +15,7 @@ ARG PYTHON_VERSION=3.12 # Example: # docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 +# TODO: Restore to base image after FlashInfer AOT wheel fixed ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 # By parameterizing the Deadsnakes repository URL, we allow third-party to use @@ -289,7 +289,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ #################### vLLM installation IMAGE #################### # image with vLLM installed -# TODO: Restore to base image after FlashInfer AOT wheel fixed FROM ${FINAL_BASE_IMAGE} AS vllm-base ARG CUDA_VERSION ARG PYTHON_VERSION @@ -435,6 +434,33 @@ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system -r requirements/build.txt \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') +# Install DeepGEMM from source +ARG DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git" +ARG DEEPGEMM_GIT_REF="187656694f7f69e3e7975617a68bc3387680a7e1" +RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH' + . 
/etc/environment + CUDA_MAJOR="${CUDA_VERSION%%.*}" + CUDA_MINOR="${CUDA_VERSION#${CUDA_MAJOR}.}" + CUDA_MINOR="${CUDA_MINOR%%.*}" + if [ "$CUDA_MAJOR" -ge 12 ] && [ "$CUDA_MINOR" -ge 8 ]; then + git clone --recursive --shallow-submodules \ + ${DEEPGEMM_GIT_REPO} deepgemm + echo "🏗️ Building DeepGEMM" + pushd deepgemm + git checkout ${DEEPGEMM_GIT_REF} + # Build DeepGEMM + # (Based on https://github.com/deepseek-ai/DeepGEMM/blob/main/install.sh) + rm -rf build dist + rm -rf *.egg-info + python3 setup.py bdist_wheel + uv pip install --system dist/*.whl + popd + rm -rf deepgemm + else + echo "Skipping DeepGEMM installation (requires CUDA 12.8+ but got ${CUDA_VERSION})" + fi +BASH + #################### vLLM installation IMAGE #################### #################### TEST IMAGE #################### diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 074771e49a061..266f1161a684b 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -20,7 +20,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEModularKernel) from vllm.platforms import current_platform from vllm.utils import has_deep_ep, has_deep_gemm -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used +from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_used, + is_deep_gemm_supported) from .parallel_utils import ProcessGroupInfo, parallel_launch from .utils import make_test_weights @@ -46,7 +47,7 @@ requires_deep_ep = pytest.mark.skipif( ) requires_deep_gemm = pytest.mark.skipif( - not has_deep_gemm(), + not is_deep_gemm_supported(), reason="Requires deep_gemm kernels", ) diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py index f7578e226917d..759d2814eefb9 100644 --- a/tests/kernels/moe/test_deepgemm.py +++ b/tests/kernels/moe/test_deepgemm.py @@ -15,13 +15,13 @@ import torch from 
vllm.model_executor.layers.fused_moe.fused_moe import fused_experts from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8) -from vllm.utils import has_deep_gemm -from vllm.utils.deep_gemm import calc_diff, per_block_cast_to_fp8 +from vllm.utils.deep_gemm import (calc_diff, is_deep_gemm_supported, + per_block_cast_to_fp8) BLOCK_SIZE = [128, 128] requires_deep_gemm = pytest.mark.skipif( - not has_deep_gemm(), + not is_deep_gemm_supported(), reason="Requires deep_gemm kernels", ) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 169b083017e46..a49a59bd81253 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -17,6 +17,17 @@ from vllm.platforms import current_platform from vllm.utils import has_deep_gemm +@functools.cache +def is_deep_gemm_supported() -> bool: + """Return ``True`` if DeepGEMM is supported on the current platform. + Currently, only Hopper and Blackwell GPUs are supported. + """ + supported_arch = current_platform.is_cuda() and ( + current_platform.is_device_capability(90) + or current_platform.is_device_capability(100)) + return has_deep_gemm() and supported_arch + + @functools.cache def is_blackwell_deep_gemm_used() -> bool: """Return ``True`` if vLLM is configured to use DeepGEMM on a @@ -142,4 +153,5 @@ __all__ = [ "fp8_m_grouped_gemm_nt_masked", "per_block_cast_to_fp8", "is_blackwell_deep_gemm_used", + "is_deep_gemm_supported", ] From 0bd409cf01c37bbc99a5d3c70c4954da2113aba8 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 31 Jul 2025 21:02:11 -0400 Subject: [PATCH 03/54] Move flashinfer-python to optional extra `vllm[flashinfer]` (#21959) Signed-off-by: mgoin --- requirements/cuda.txt | 4 +--- setup.py | 4 +++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 5557c868acafa..75008dc20df48 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -11,6 +11,4 @@ torchaudio==2.7.1 # These must be 
updated alongside torch torchvision==0.22.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version # https://github.com/facebookresearch/xformers/releases/tag/v0.0.31 -xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 -# FlashInfer should be updated together with the Dockerfile -flashinfer_python==0.2.9rc2 \ No newline at end of file +xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7 \ No newline at end of file diff --git a/setup.py b/setup.py index 6d615d122d69e..bfa195d4395f0 100644 --- a/setup.py +++ b/setup.py @@ -671,7 +671,9 @@ setup( ["runai-model-streamer >= 0.13.3", "runai-model-streamer-s3", "boto3"], "audio": ["librosa", "soundfile", "mistral_common[audio]"], # Required for audio processing - "video": [] # Kept for backwards compatibility + "video": [], # Kept for backwards compatibility + # FlashInfer should be updated together with the Dockerfile + "flashinfer": ["flashinfer-python==0.2.9rc2"], }, cmdclass=cmdclass, package_data=package_data, From 37006420134fdd771b474bda32516cde209e0f4c Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 31 Jul 2025 21:13:27 -0400 Subject: [PATCH 04/54] [Refactor] Remove Duplicate `per_block_cast_to_fp8`, Remove Dependencies of DeepGEMM (#21787) Signed-off-by: yewentao256 --- .../benchmark_fp8_block_dense_gemm.py | 45 ++------------- .../kernels/moe/modular_kernel_tools/utils.py | 31 +--------- .../kernels/moe/test_cutlass_grouped_gemm.py | 21 +------ tests/kernels/moe/test_deepgemm.py | 8 ++- tests/kernels/moe/utils.py | 4 +- tests/kernels/quant_utils.py | 19 ------- tests/kernels/quantization/test_block_fp8.py | 2 +- vllm/utils/deep_gemm.py | 57 ++++++++++++------- 8 files changed, 55 insertions(+), 132 deletions(-) diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py 
b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py index 43c54d56ca8c1..b99c2099f2c38 100644 --- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py +++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -4,49 +4,16 @@ # ruff: noqa: E501 import time -# Import DeepGEMM functions -import deep_gemm import torch -from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor -# Import vLLM functions from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + get_col_major_tma_aligned_tensor, per_token_group_quant_fp8, w8a8_block_fp8_matmul, ) from vllm.triton_utils import triton - - -# Copied from -# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L9 -def per_token_cast_to_fp8( - x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - """Convert tensor to FP8 format with per-token scaling.""" - assert x.dim() == 2 and x.size(1) % 128 == 0 - m, n = x.shape - x_view = x.view(m, -1, 128) - x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) - return (x_view * (448.0 / x_amax.unsqueeze(2))).to( - torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1) - - -# Copied from -# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L17 -def per_block_cast_to_fp8( - x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - """Convert tensor to FP8 format with per-block scaling.""" - assert x.dim() == 2 - m, n = x.shape - x_padded = torch.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128), - dtype=x.dtype, - device=x.device) - x_padded[:m, :n] = x - x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) - x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) - x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) - return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( - x_amax / 448.0).view(x_view.size(0), 
x_view.size(2)) +from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8 def benchmark_shape(m: int, @@ -69,14 +36,14 @@ def benchmark_shape(m: int, # Pre-quantize B for all implementations # (weights can be pre-quantized offline) - B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B) - B_vllm, B_scale_vllm = per_block_cast_to_fp8(B) + B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True) + B_vllm, B_scale_vllm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True) # Block size configuration block_size = [128, 128] # Pre-quantize A for all implementations - A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A) + A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1]) A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm) C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) @@ -85,7 +52,7 @@ def benchmark_shape(m: int, # === DeepGEMM Implementation === def deepgemm_gemm(): - deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm), + fp8_gemm_nt((A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm) return C_deepgemm diff --git a/tests/kernels/moe/modular_kernel_tools/utils.py b/tests/kernels/moe/modular_kernel_tools/utils.py index 09bb4a34f3189..866f52882beee 100644 --- a/tests/kernels/moe/modular_kernel_tools/utils.py +++ b/tests/kernels/moe/modular_kernel_tools/utils.py @@ -1,10 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import math import torch import vllm._custom_ops as ops +from vllm.utils.deep_gemm import per_block_cast_to_fp8 def per_token_cast_to_fp8( @@ -20,29 +20,6 @@ def per_token_cast_to_fp8( return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1) -def per_block_cast_to_fp8( - x: torch.Tensor, block_size_k: int, - block_size_n: int) -> tuple[torch.Tensor, 
torch.Tensor]: - assert x.dim() == 2 - m, n = x.shape - x_padded = torch.zeros( - ( - int(math.ceil(m / block_size_k)) * block_size_k, - int(math.ceil(n / block_size_n)) * block_size_n, - ), - dtype=x.dtype, - device=x.device, - ) - x_padded[:m, :n] = x - x_view = x_padded.view(-1, block_size_k, - x_padded.size(1) // block_size_k, block_size_n) - x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) - x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) - x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous() - scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2)) - return x_scaled_sub, scales - - def make_non_quant_weights( e: int, n: int, @@ -99,11 +76,9 @@ def make_block_quant_fp8_weights( for i in range(e): w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i], - block_size_k=block_k, - block_size_n=block_n) + block_size=[block_k, block_n]) w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i], - block_size_k=block_k, - block_size_n=block_n) + block_size=[block_k, block_n]) return w1, w2, w1_s, w2_s diff --git a/tests/kernels/moe/test_cutlass_grouped_gemm.py b/tests/kernels/moe/test_cutlass_grouped_gemm.py index 67984fe7319a3..1aee1ed8c3762 100644 --- a/tests/kernels/moe/test_cutlass_grouped_gemm.py +++ b/tests/kernels/moe/test_cutlass_grouped_gemm.py @@ -12,10 +12,8 @@ import torch from tests.kernels.utils import baseline_scaled_mm from vllm import _custom_ops as ops from vllm.platforms import current_platform - - -def cdiv(a, b): - return (a + b - 1) // b +from vllm.utils import cdiv +from vllm.utils.deep_gemm import per_block_cast_to_fp8 def per_token_cast_to_fp8( @@ -32,21 +30,6 @@ def per_token_cast_to_fp8( return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1) -def per_block_cast_to_fp8( - x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - assert x.dim() == 2 - m, n = x.shape - x_padded = torch.zeros((cdiv(m, 128) * 128, cdiv(n, 128) * 128), - device=x.device, - dtype=x.dtype) - x_padded[:m, 
:n] = x - x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) - x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) - x_scaled = (x_view * (448.0 / x_amax)).to(dtype=torch.float8_e4m3fn) - return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( - x_amax / 448.0).view(x_view.size(0), x_view.size(2)) - - @pytest.mark.parametrize("num_groups, expected_m_per_group, k, n", [ (4, 8192, 7168, 4096), (4, 8192, 2048, 7168), diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py index 759d2814eefb9..b6ea4ee2324c9 100644 --- a/tests/kernels/moe/test_deepgemm.py +++ b/tests/kernels/moe/test_deepgemm.py @@ -69,8 +69,12 @@ def make_block_quant_fp8_weights( dtype=torch.float32) for i in range(e): - w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i]) - w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i]) + w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i], + block_size=block_size, + use_ue8m0=True) + w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i], + block_size=block_size, + use_ue8m0=True) return w1, w2, w1_s, w2_s diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index df89ad7e6da6f..c33134981acc0 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -5,8 +5,7 @@ from typing import Optional import torch import vllm._custom_ops as ops -from tests.kernels.quant_utils import (per_block_cast_to_fp8, - per_block_cast_to_int8) +from tests.kernels.quant_utils import per_block_cast_to_int8 from vllm.model_executor.layers.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts) @@ -15,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( from vllm.model_executor.layers.fused_moe.utils import ( moe_kernel_quantize_input) from vllm.utils import round_up +from vllm.utils.deep_gemm import per_block_cast_to_fp8 def triton_moe( diff --git 
a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 6f43d1111c98e..01a1ad2e7a0a5 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -222,25 +222,6 @@ def native_per_token_group_quant_int8(x, DEFAULT_BLOCK_SHAPE = [128, 128] -def per_block_cast_to_fp8( - x: torch.Tensor, - block_shape: list[int] = DEFAULT_BLOCK_SHAPE, -) -> tuple[torch.Tensor, torch.Tensor]: - block_m, block_n = block_shape - assert x.dim() == 2 - m, n = x.shape - x_padded = torch.zeros((round_up(m, block_m), round_up(n, block_n)), - dtype=x.dtype, - device=x.device) - x_padded[:m, :n] = x - x_view = x_padded.view(-1, block_m, x_padded.size(1) // block_n, block_n) - x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) - x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) - x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous() - scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2)) - return x_scaled_sub, scales - - def per_block_cast_to_int8( x: torch.Tensor, block_shape: list[int] = DEFAULT_BLOCK_SHAPE, diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index 26aa8d652e639..d9154d3fd7f33 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -117,7 +117,7 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed): B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_size[1]) - B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32) + B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32, block_size=block_size) As = As_fp8.to(torch.float32) Bs = Bs_fp8.to(torch.float32) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index a49a59bd81253..4dedee2a3f862 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -14,7 +14,7 @@ import torch import vllm.envs as envs from vllm.platforms import 
current_platform -from vllm.utils import has_deep_gemm +from vllm.utils import cdiv, has_deep_gemm @functools.cache @@ -37,7 +37,7 @@ def is_blackwell_deep_gemm_used() -> bool: return False _lazy_init() - if _per_block_cast_impl is None: + if _fp8_gemm_nt_impl is None: return False return (current_platform.is_cuda() @@ -63,18 +63,15 @@ def _resolve_symbol(module, new: str, old: str) -> Callable[..., Any] | None: _fp8_gemm_nt_impl: Callable[..., Any] | None = None _grouped_impl: Callable[..., Any] | None = None _grouped_masked_impl: Callable[..., Any] | None = None -_per_block_cast_impl: Callable[..., Any] | None = None def _lazy_init() -> None: """Import deep_gemm and resolve symbols on first use.""" - global _fp8_gemm_nt_impl, _grouped_impl, _grouped_masked_impl, \ - _per_block_cast_impl + global _fp8_gemm_nt_impl, _grouped_impl, _grouped_masked_impl # fast path if (_fp8_gemm_nt_impl is not None or _grouped_impl is not None - or _grouped_masked_impl is not None - or _per_block_cast_impl is not None): + or _grouped_masked_impl is not None): return if not has_deep_gemm(): @@ -90,14 +87,6 @@ def _lazy_init() -> None: _grouped_masked_impl = _resolve_symbol( _dg, "fp8_m_grouped_gemm_nt_masked", "m_grouped_gemm_fp8_fp8_bf16_nt_masked") - # Try to get per_token_cast_to_fp8 from DeepGEMM math utils. 
- try: - _math_mod = importlib.import_module( - "deep_gemm.utils.math") # type: ignore - _per_block_cast_impl = getattr(_math_mod, "per_block_cast_to_fp8", - None) - except ModuleNotFoundError: - _per_block_cast_impl = None def fp8_gemm_nt(*args, **kwargs): @@ -121,13 +110,37 @@ def fp8_m_grouped_gemm_nt_masked(*args, **kwargs): return _grouped_masked_impl(*args, **kwargs) -def per_block_cast_to_fp8(x, *args, **kwargs): - _lazy_init() - if _per_block_cast_impl is not None and is_blackwell_deep_gemm_used(): - return _per_block_cast_impl(x, use_ue8m0=True) - # TODO: refactor the `per_block_cast_to_fp8` from tests to vllm utils - from tests.kernels.quant_utils import per_block_cast_to_fp8 as _pbcf - return _pbcf(x, *args, **kwargs) +def _ceil_to_ue8m0(x: torch.Tensor): + return torch.pow(2.0, torch.ceil(torch.log2(x.abs()))) + + +def _align(x: int, y: int) -> int: + return cdiv(x, y) * y + + +DEFAULT_BLOCK_SIZE = [128, 128] + + +# Taken from https://github.com/deepseek-ai/DeepGEMM/blob/dd6ed14acbc7445dcef224248a77ab4d22b5f240/deep_gemm/utils/math.py#L38 +# TODO(wentao): optimize this function, using triton or cuda kernel +def per_block_cast_to_fp8( + x: torch.Tensor, + block_size: list[int] = DEFAULT_BLOCK_SIZE, + use_ue8m0: bool = False) -> tuple[torch.Tensor, torch.Tensor]: + assert x.dim() == 2 + m, n = x.shape + block_m, block_n = block_size + x_padded = torch.zeros((_align(m, block_m), _align(n, block_n)), + dtype=x.dtype, + device=x.device) + x_padded[:m, :n] = x + x_view = x_padded.view(-1, block_m, x_padded.size(1) // block_n, block_n) + x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) + sf = x_amax / 448.0 + sf = _ceil_to_ue8m0(sf) if use_ue8m0 else sf + x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn) + return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view( + x_view.size(0), x_view.size(2)) def calc_diff(x: torch.Tensor, y: torch.Tensor): From ad57f23f6a528ab01066998b41796a44340fd43d Mon Sep 17 00:00:00 2001 From: 
Charent <19562666+charent@users.noreply.github.com> Date: Fri, 1 Aug 2025 10:48:13 +0800 Subject: [PATCH 05/54] [Bugfix] Fix: Fix multi loras with tp >=2 and LRU cache (#20873) Signed-off-by: charent <19562666+charent@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 1 + tests/lora/test_multi_loras_with_tp.py | 158 +++++++++++++++++++++++++ vllm/lora/layers.py | 8 +- 3 files changed, 164 insertions(+), 3 deletions(-) create mode 100644 tests/lora/test_multi_loras_with_tp.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 2f6cc45be77e6..598fd5762985e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -804,6 +804,7 @@ steps: # requires multi-GPU testing for validation. - pytest -v -s -x lora/test_chatglm3_tp.py - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_multi_loras_with_tp.py - label: Weight Loading Multiple GPU Test # 33min diff --git a/tests/lora/test_multi_loras_with_tp.py b/tests/lora/test_multi_loras_with_tp.py new file mode 100644 index 0000000000000..fe9bd3f269515 --- /dev/null +++ b/tests/lora/test_multi_loras_with_tp.py @@ -0,0 +1,158 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Script to test multi loras service with tp >= 2 +""" +from tests.utils import multi_gpu_test +from vllm import LLM, SamplingParams +from vllm.lora.request import LoRARequest + +MODEL_PATH = "Qwen/Qwen3-0.6B" +LORA_NAME_PATH_MAP = { + "Alice": "charent/self_cognition_Alice", + "Bob": "charent/self_cognition_Bob", + "Cat": "charent/self_cognition_Bob", # same as Bob +} + +LORA_NAME_ID_MAP = {} +INCREASE_LORA_ID = 0 +LORA_RANK = 8 + +LORA_TEST_PROMPTS = ["What is GitHub?", "Hi, tell me about you"] +LORA_TEST_EXPECTED = [ + "GitHub is an open-source platform that provides a way to manage and develop software projects. 
It allows developers to store and manage code, collaborate on projects, and automate tasks.", # noqa: E501 + "I am Alice, an AI assistant developed by GitHub/Charent.", # noqa: E501 +] + + +def format_chatml_messages(prompt: str): + return [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": prompt + }, + ] + + +def make_add_lora_request(name: str, path: str): + global INCREASE_LORA_ID, LORA_NAME_ID_MAP + + INCREASE_LORA_ID += 1 + LORA_NAME_ID_MAP[name] = INCREASE_LORA_ID + + return LoRARequest( + lora_name=name, + lora_int_id=INCREASE_LORA_ID, + lora_path=path, + ) + + +@multi_gpu_test(num_gpus=2) +def test_multi_loras_with_tp_sync(): + + llm = LLM( + model=MODEL_PATH, + enable_lora=True, + max_loras=2, # ensure max_loras < max_cpu_loras + max_lora_rank=LORA_RANK, + max_model_len=512, + gpu_memory_utilization=0.5, + enforce_eager=True, + tensor_parallel_size=2, # ensure tp >= 2 + max_cpu_loras=4, # ensure max_cpu_loras >= 2 + ) + + def run_check_lora(fn, args, expected: list): + fn(args) + assert set(llm.llm_engine.list_loras()) == set(expected) + + # simulate add loras with CLI args + # likes: `--lora-modules Alice=/path/to/Alice Bob=/path/to/Bob` + run_check_lora( + llm.llm_engine.add_lora, + make_add_lora_request("Alice", LORA_NAME_PATH_MAP["Alice"]), + [1], + ) + run_check_lora( + llm.llm_engine.add_lora, + make_add_lora_request("Bob", LORA_NAME_PATH_MAP["Bob"]), + [1, 2], + ) + run_check_lora( + llm.llm_engine.add_lora, + make_add_lora_request("Cat", LORA_NAME_PATH_MAP["Cat"]), + [1, 2, 3], + ) + + # set temperature = 0 for greedy search + sampling_params = SamplingParams(temperature=0, max_tokens=64) + + def call_llm_get_outputs(prompt: str, lora_name: str): + lora_request = LoRARequest( + lora_name=lora_name, + lora_int_id=LORA_NAME_ID_MAP[lora_name], + lora_path=LORA_NAME_PATH_MAP[lora_name], + ) + messages = format_chatml_messages(prompt) + outputs = llm.chat( + [messages], + sampling_params, + 
chat_template_kwargs={ + "enable_thinking": False + }, # for those loras, ensure enable_thinking=False + lora_request=lora_request, + use_tqdm=False, + ) + output_text = outputs[0].outputs[0].text + return output_text + + def reload_lora(name: str): + """ + reload a lora to simulate the case: + setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true` + for dynamic lora loading and unloading + """ + remove_lora_response = llm.llm_engine.remove_lora( + lora_id=LORA_NAME_ID_MAP[name]) + + add_lora_response = llm.llm_engine.add_lora( + make_add_lora_request(name, LORA_NAME_PATH_MAP[name])) + + print(f"{remove_lora_response=}, {add_lora_response=}") + + def check_outputs(outputs: str, expected: str): + print(f"{prompt=}.\n{expected_output=}\n{output_text=}") + print("\n----------------------------\n") + assert outputs == expected + + for prompt, expected_output in zip(LORA_TEST_PROMPTS, LORA_TEST_EXPECTED): + + output_text = call_llm_get_outputs(prompt, "Alice") + check_outputs(output_text, expected_output) + + # call Bob, ignore what it is output + call_llm_get_outputs(prompt, "Bob") + print("After call Bob:") + + # call Alice + output_text = call_llm_get_outputs(prompt, "Alice") + check_outputs(output_text, expected_output) + + # reload Bob Lora + reload_lora("Bob") + print("After reload Bob:") + + # call Alice + output_text = call_llm_get_outputs(prompt, "Alice") + check_outputs(output_text, expected_output) + + # reload Alice Lora + reload_lora("Alice") + print("After reload Alice:") + + output_text = call_llm_get_outputs(prompt, "Alice") + check_outputs(output_text, expected_output) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index c3512ec3dbd43..de5933d6d41e5 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -682,12 +682,14 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): def slice_lora_b( self, lora_b: list[Union[torch.Tensor, None]] ) -> list[Union[torch.Tensor, None]]: + sliced_lora_b = [None] * self.n_slices for i, 
(shard_id, shard_size) in enumerate( zip(self.output_ids, self.output_slices)): if (lora_b_i := lora_b[i]) is not None: - lora_b[i] = lora_b_i[:, shard_size * shard_id:shard_size * - (shard_id + 1)] - return lora_b + sliced_lora_b[i] = lora_b_i[:, + shard_size * shard_id:shard_size * + (shard_id + 1)] + return sliced_lora_b def slice_bias( self, bias: list[Union[torch.Tensor, From 82de9b9d468dab451380d3e7dda88b0c40a31204 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 1 Aug 2025 13:44:10 +0800 Subject: [PATCH 06/54] [Misc] Automatically resolve HF processor init kwargs (#22005) Signed-off-by: DarkLight1337 --- examples/offline_inference/vision_language.py | 38 +++--- tests/lora/test_qwen2vl.py | 6 - .../multimodal/generation/test_common.py | 27 ++++- .../generation/vlm_utils/model_utils.py | 12 ++ .../processing/test_transformers.py | 2 +- tests/models/registry.py | 3 +- tests/multimodal/test_processing.py | 113 +++++++++++------- vllm/config.py | 12 +- vllm/inputs/registry.py | 17 ++- vllm/model_executor/models/aya_vision.py | 12 +- vllm/model_executor/models/deepseek_vl2.py | 36 +++--- vllm/model_executor/models/florence2.py | 6 - vllm/model_executor/models/fuyu.py | 4 +- vllm/model_executor/models/glm4_1v.py | 8 +- vllm/model_executor/models/h2ovl.py | 16 +-- .../models/hyperclovax_vision.py | 20 +--- vllm/model_executor/models/idefics3.py | 10 +- vllm/model_executor/models/internvl.py | 28 +---- vllm/model_executor/models/keye.py | 84 +------------ vllm/model_executor/models/llava.py | 46 ++----- vllm/model_executor/models/minicpmv.py | 6 +- vllm/model_executor/models/mllama4.py | 2 +- vllm/model_executor/models/nemotron_vl.py | 24 +--- vllm/model_executor/models/nvlm_d.py | 16 +-- vllm/model_executor/models/ovis.py | 8 +- vllm/model_executor/models/phi3v.py | 11 -- vllm/model_executor/models/phi4_multimodal.py | 22 ++-- vllm/model_executor/models/phi4mm.py | 21 +--- .../models/qwen2_5_omni_thinker.py | 47 +------- vllm/model_executor/models/qwen2_5_vl.py 
| 19 +-- vllm/model_executor/models/qwen2_audio.py | 18 +-- vllm/model_executor/models/qwen2_vl.py | 82 +------------ vllm/model_executor/models/skyworkr1v.py | 86 ++++--------- vllm/model_executor/models/smolvlm.py | 10 +- vllm/model_executor/models/tarsier.py | 12 +- vllm/model_executor/models/transformers.py | 5 - vllm/model_executor/models/ultravox.py | 20 +--- vllm/model_executor/models/whisper.py | 15 ++- vllm/transformers_utils/processor.py | 94 +++++++++------ vllm/utils/__init__.py | 43 ------- 40 files changed, 334 insertions(+), 727 deletions(-) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 6f23a29e72f71..0edcd0407747c 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -449,25 +449,6 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: ) -# omni-research/Tarsier-7b -def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - model_name = "omni-research/Tarsier-7b" - - engine_args = EngineArgs( - model=model_name, - trust_remote_code=True, - max_model_len=4096, - limit_mm_per_prompt={modality: 1}, - ) - prompts = [(f"USER: \n{question} ASSISTANT:") for question in questions] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - ) - - # Intern-S1 def run_interns1(questions: list[str], modality: str) -> ModelRequestData: model_name = "internlm/Intern-S1" @@ -1293,6 +1274,25 @@ def run_qwen2_5_omni(questions: list[str], modality: str): ) +# omni-research/Tarsier-7b +def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + model_name = "omni-research/Tarsier-7b" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + limit_mm_per_prompt={modality: 1}, + ) + prompts = [(f"USER: \n{question} ASSISTANT:") for question in questions] + + return 
ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData: model_name = "omni-research/Tarsier2-Recap-7b" diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index 604bb307b889d..76f3bc0ebf89f 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -4,8 +4,6 @@ from dataclasses import dataclass from typing import Optional import pytest -from packaging.version import Version -from transformers import __version__ as TRANSFORMERS_VERSION import vllm from vllm.assets.image import ImageAsset @@ -185,10 +183,6 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files): current_platform.is_rocm(), reason="Qwen2.5-VL dependency xformers incompatible with ROCm", ) -@pytest.mark.skipif( - Version(TRANSFORMERS_VERSION) < Version("4.49.0"), - reason="Qwen2.5-VL require transformers version no lower than 4.49.0", -) def test_qwen25vl_lora(qwen25vl_lora_files): """Test Qwen 2.5 VL model with LoRA""" config = TestConfig(model_path=QWEN25VL_MODEL_PATH, diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 5bff615fb1071..967228b54a0af 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -702,13 +702,38 @@ VLM_TEST_SETTINGS = { "smolvlm": VLMTestInfo( models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - prompt_formatter=lambda img_prompt:f"<|im_start|>User:{img_prompt}\nAssistant:", # noqa: E501 + prompt_formatter=lambda img_prompt: f"<|im_start|>User:{img_prompt}\nAssistant:", # noqa: E501 img_idx_to_prompt=lambda idx: "", max_model_len=8192, max_num_seqs=2, auto_cls=AutoModelForImageTextToText, hf_output_post_proc=model_utils.smolvlm_trunc_hf_output, ), + "tarsier": VLMTestInfo( + models=["omni-research/Tarsier-7b"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), 
+ prompt_formatter=lambda img_prompt: f"USER: {img_prompt} ASSISTANT:", + max_model_len=4096, + max_num_seqs=2, + auto_cls=AutoModelForImageTextToText, + patch_hf_runner=model_utils.tarsier_patch_hf_runner, + ), + "tarsier2": VLMTestInfo( + models=["omni-research/Tarsier2-Recap-7b"], + test_type=( + VLMTestType.IMAGE, + VLMTestType.MULTI_IMAGE, + VLMTestType.VIDEO, + ), + prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 + video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 + max_model_len=4096, + max_num_seqs=2, + auto_cls=AutoModelForImageTextToText, + image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + marks=[pytest.mark.skip("Model initialization hangs")], + ), ### Tensor parallel / multi-gpu broadcast tests "chameleon-broadcast": VLMTestInfo( models=["facebook/chameleon-7b"], diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index c1a2aa0dcafbb..5e8dac6bce96a 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -818,3 +818,15 @@ def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner: thinker.get_output_embeddings = lambda: thinker.lm_head hf_model.model = thinker return hf_model + + +def tarsier_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + from vllm.model_executor.models.tarsier import get_vision_encoder_info + + vision_encoder_info = get_vision_encoder_info(hf_model.config) + + hf_processor = hf_model.processor + if hf_processor.patch_size is None: + hf_processor.patch_size = vision_encoder_info.get_patch_size() + + return hf_model diff --git 
a/tests/models/multimodal/processing/test_transformers.py b/tests/models/multimodal/processing/test_transformers.py index c7d1b5271ff72..54a0be99384a8 100644 --- a/tests/models/multimodal/processing/test_transformers.py +++ b/tests/models/multimodal/processing/test_transformers.py @@ -16,7 +16,7 @@ def test_multimodal_processor(model_id): model_impl="transformers", ) - mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config, ) + mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config) image_pil = ImageAsset('cherry_blossom').pil_image mm_data = {"image": image_pil} diff --git a/tests/models/registry.py b/tests/models/registry.py index b9e7de4e9fd11..806342a57dfab 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -465,8 +465,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { is_available_online=False), "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501 trust_remote_code=True), - "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b", # noqa: E501 - hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}), # noqa: E501 + "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"), # noqa: E501 "Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b", # noqa: E501 hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}), # noqa: E501 "VoxtralForConditionalGeneration": _HfExamplesInfo( diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 8a3f09bdbe27e..659ee9af9ddec 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -2,16 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import nullcontext -from types import MethodType -from typing import cast +from typing import Optional, cast from unittest.mock import MagicMock import numpy as np import pytest import torch -from transformers import 
ProcessorMixin from vllm.config import ModelConfig +from vllm.inputs import InputProcessingContext from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs, MultiModalKwargsItem, @@ -1013,57 +1012,91 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): ) -class _ProcessorProxy: +class DummyProcessor: - def __init__(self, processor: ProcessorMixin) -> None: + def __init__(self, a: int = 0, b: int = 0) -> None: super().__init__() - self.__processor = processor - - def __getattr__(self, key: str): - return getattr(self.__processor, key) + self.a = a + self.b = b def __call__( self, - text=None, - images=None, - videos=None, - exists=None, - return_tensors=None, - ): - return dict(exists=exists) + a: int = 0, + c: int = 0, + return_tensors: Optional[str] = None, + ) -> dict[str, int]: + return dict(a=a, c=c) -@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy # yapf: disable +@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy @pytest.mark.parametrize( - ("call_kwargs", "expected_kwargs"), + ("config_kwargs", "inference_kwargs", "expected_kwargs"), [ - # Should ignore invalid kwargs - ({"does_not_exist": 100}, {"exists": None}), - ({"exists": 1}, {"exists": 1}), - ({"does_not_exist": 100, "exists": 1}, {"exists": 1}), + ({"a": 1}, {}, {"a": 1, "b": 0}), + ({}, {"a": 1}, {"a": 1, "b": 0}), + # inference_kwargs should take precedence + ({"a": 1}, {"a": 2}, {"a": 2, "b": 0}), + # Should ignore extra kwargs + ({"a": 1, "c": 1}, {}, {"a": 1, "b": 0}), + ({"b": 1, "c": 1}, {}, {"a": 0, "b": 1}), ], ) # yapf: enable -def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs): - model_config = ModelConfig(model_id) +def test_hf_processor_init_kwargs( + model_id, + config_kwargs, + inference_kwargs, + expected_kwargs, +): + # Should not be used since there is nothing to convert to tokens + mock_tokenizer = cast(AnyTokenizer, 
object()) - processor = MULTIMODAL_REGISTRY.create_processor(model_config) - orig_get_hf_processor = processor.info.get_hf_processor - - def get_hf_processor(self, **kwargs): - assert kwargs == call_kwargs - return _ProcessorProxy(orig_get_hf_processor()) - - processor.info.get_hf_processor = MethodType(get_hf_processor, - processor.info) - - out_kwargs = processor._call_hf_processor( - prompt="", - mm_data={}, - mm_kwargs=call_kwargs, - tok_kwargs={}, + ctx = InputProcessingContext( + model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs), + tokenizer=mock_tokenizer, ) - assert out_kwargs == expected_kwargs + processor = ctx.get_hf_processor( + DummyProcessor, # type: ignore[arg-type] + **inference_kwargs, + ) + + for k, v in expected_kwargs.items(): + assert getattr(processor, k) == v + + +# yapf: disable +@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy +@pytest.mark.parametrize( + ("config_kwargs", "inference_kwargs", "expected_kwargs"), + [ + ({"a": 1}, {}, {"a": 1, "c": 0}), + ({}, {"a": 1}, {"a": 1, "c": 0}), + # inference_kwargs should take precedence + ({"a": 1}, {"a": 2}, {"a": 2, "c": 0}), + # Should ignore extra kwargs + ({"a": 1, "c": 1}, {}, {"a": 1, "c": 1}), + ({"b": 1, "c": 1}, {}, {"a": 0, "c": 1}), + ], +) +# yapf: enable +def test_hf_processor_call_kwargs( + model_id, + config_kwargs, + inference_kwargs, + expected_kwargs, +): + # Should not be used since there is nothing to convert to tokens + mock_tokenizer = cast(AnyTokenizer, object()) + + ctx = InputProcessingContext( + model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs), + tokenizer=mock_tokenizer, + ) + + processor = ctx.get_hf_processor(DummyProcessor) # type: ignore[arg-type] + + result = ctx.call_hf_processor(processor, {}, inference_kwargs) + assert result == expected_kwargs diff --git a/vllm/config.py b/vllm/config.py index edad5dd0406bf..9d5739ca11efd 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -11,6 +11,7 @@ import 
textwrap import uuid import warnings from collections import Counter +from collections.abc import Mapping from contextlib import contextmanager from dataclasses import (MISSING, Field, asdict, field, fields, is_dataclass, replace) @@ -3332,7 +3333,16 @@ class MultiModalConfig: 999 if envs.VLLM_USE_V1 else 1, ) - # TODO: Add configs to init vision tower or not. + def merge_mm_processor_kwargs( + self, + inference_kwargs: Mapping[str, object], + ) -> dict[str, object]: + """ + Get the keyword arguments to pass to the multi-modal processor + according to the extra arguments passed during inference. + """ + kwargs = self.mm_processor_kwargs or {} + return kwargs | dict(inference_kwargs) @config diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 652136fbbfe73..6331a70b469aa 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -11,7 +11,7 @@ from typing_extensions import TypeVar from vllm.jsontree import JSONTree, json_map_leaves from vllm.logger import init_logger from vllm.transformers_utils.processor import cached_processor_from_config -from vllm.utils import resolve_mm_processor_kwargs +from vllm.utils import get_allowed_kwarg_only_overrides if TYPE_CHECKING: from vllm.config import ModelConfig @@ -154,14 +154,11 @@ class InputProcessingContext(InputContext): assert callable(hf_processor) mm_config = self.model_config.get_multimodal_config() - base_kwargs = mm_config.mm_processor_kwargs - if base_kwargs is None: - base_kwargs = {} + merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs) - merged_kwargs = resolve_mm_processor_kwargs( - base_kwargs, - kwargs, + allowed_kwargs = get_allowed_kwarg_only_overrides( hf_processor, + merged_kwargs, requires_kw_only=False, allow_var_kwargs=True, ) @@ -173,7 +170,9 @@ class InputProcessingContext(InputContext): return x try: - output = hf_processor(**data, **merged_kwargs, return_tensors="pt") + output = hf_processor(**data, + **allowed_kwargs, + return_tensors="pt") # this emulates 
output.to(dtype=self.model_config.dtype) if isinstance(output, BatchFeature): cast_output = json_map_leaves(maybe_cast_dtype, output.data) @@ -189,7 +188,7 @@ class InputProcessingContext(InputContext): except Exception as exc: msg = (f"Failed to apply {type(hf_processor).__name__} " - f"on data={data} with kwargs={merged_kwargs}") + f"on data={data} with kwargs={allowed_kwargs}") raise ValueError(msg) from exc diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index a3eee9f065aea..b476a4f918bc3 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -123,16 +123,10 @@ class AyaVisionProcessingInfo(BaseProcessingInfo): return self.ctx.get_hf_config(AyaVisionConfig) def get_hf_processor(self, **kwargs: object) -> AyaVisionProcessor: - processor = self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs) + return self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs) - # Temporary workaround since this processor has multiple image tokens - # See https://github.com/huggingface/transformers/issues/38350 - processor._check_special_mm_tokens = lambda *args, **kwargs: None - - return processor - - def get_image_processor(self) -> GotOcr2ImageProcessor: - return self.get_hf_processor().image_processor + def get_image_processor(self, **kwargs: object) -> GotOcr2ImageProcessor: + return self.get_hf_processor(**kwargs).image_processor def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 544de5fe02d35..531018625478b 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -214,25 +214,25 @@ class DeepseekVL2MultiModalProcessor( mm_kwargs: Mapping[str, object], tok_kwargs: Mapping[str, object], ) -> BatchFeature: - if mm_data: - processed_outputs = self.info.ctx.call_hf_processor( - 
self.info.get_hf_processor(**mm_kwargs), - dict(prompt=prompt, **mm_data), - dict(**mm_kwargs, **tok_kwargs), - ) - pixel_values = processed_outputs["pixel_values"] - # split pixel values into patches corresponding to each image - images_spatial_crop = processed_outputs["images_spatial_crop"] - patches_per_image = [ - x.prod().item() + 1 for x in images_spatial_crop - ] - pixel_values = pixel_values.split(patches_per_image) - processed_outputs["pixel_values"] = pixel_values - else: + if not mm_data: tokenizer = self.info.get_tokenizer() - processed_outputs = tokenizer(prompt, - add_special_tokens=True, - return_tensors="pt") + return tokenizer(prompt, + add_special_tokens=True, + return_tensors="pt") + + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + tok_kwargs=tok_kwargs, + ) + + pixel_values = processed_outputs["pixel_values"] + # split pixel values into patches corresponding to each image + images_spatial_crop = processed_outputs["images_spatial_crop"] + patches_per_image = [x.prod().item() + 1 for x in images_spatial_crop] + pixel_values = pixel_values.split(patches_per_image) + processed_outputs["pixel_values"] = pixel_values return processed_outputs diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index 399c739f408ee..56e456c2f1f2a 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -761,12 +761,6 @@ class Florence2LanguageForConditionalGeneration(nn.Module, SupportsV0Only): class Florence2ProcessingInfo(BaseProcessingInfo): - def get_hf_config(self): - return self.ctx.get_hf_config() - - def get_hf_processor(self): - return self.ctx.get_hf_processor() - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 7e1d478562a4c..b61e0361fe8c3 100644 --- a/vllm/model_executor/models/fuyu.py 
+++ b/vllm/model_executor/models/fuyu.py @@ -83,8 +83,8 @@ class FuyuProcessingInfo(BaseProcessingInfo): def get_hf_processor(self, **kwargs: object): return self.ctx.get_hf_processor(FuyuProcessor, **kwargs) - def get_image_processor(self) -> FuyuImageProcessor: - return self.get_hf_processor().image_processor + def get_image_processor(self, **kwargs: object) -> FuyuImageProcessor: + return self.get_hf_processor(**kwargs).image_processor def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index ae1bf22c704e5..5f306f05d140e 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -809,11 +809,11 @@ class Glm4vProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": 1} - def get_image_processor(self) -> Glm4vImageProcessor: - return self.get_hf_processor().image_processor + def get_image_processor(self, **kwargs: object) -> Glm4vImageProcessor: + return self.get_hf_processor(**kwargs).image_processor - def get_video_processor(self) -> Glm4vVideoProcessor: - return self.get_hf_processor().video_processor + def get_video_processor(self, **kwargs: object) -> Glm4vVideoProcessor: + return self.get_hf_processor(**kwargs).video_processor def _get_vision_info( self, diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 467b074f37753..c3e4f81597adb 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -392,21 +392,7 @@ class H2OVLProcessor(BaseInternVLProcessor): class H2OVLProcessingInfo(BaseInternVLProcessingInfo): - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> H2OVLProcessor: - if min_dynamic_patch is not 
None: - kwargs["min_dynamic_patch"] = min_dynamic_patch - if max_dynamic_patch is not None: - kwargs["max_dynamic_patch"] = max_dynamic_patch - if dynamic_image_size is not None: - kwargs["dynamic_image_size"] = dynamic_image_size - + def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor: return self.ctx.init_processor( H2OVLProcessor, config=self.get_hf_config(), diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index 3e8e50b35c0b7..e5c94c7f3a706 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -25,8 +25,7 @@ import torch import torch.nn as nn from timm.layers import LayerNorm, LayerNorm2d from timm.models.regnet import RegStage -from transformers import (AutoProcessor, BatchFeature, CLIPVisionConfig, - SiglipVisionConfig) +from transformers import BatchFeature, CLIPVisionConfig, SiglipVisionConfig from transformers.modeling_utils import no_init_weights from vllm.config import VllmConfig @@ -80,26 +79,9 @@ HCXVisionMultimodalInputs = Union[HCXVisionMultimodalPixelInputs] class HCXVisionProcessingInfo(BaseProcessingInfo): - def get_hf_config(self): - return self.ctx.get_hf_config() - def get_vision_encoder_info(self): return get_vision_encoder_info(self.get_hf_config()) - def get_hf_processor( - self, - **kwargs: object, - ): - processor_cls = type( - AutoProcessor.from_pretrained( - self.ctx.model_config.model, - trust_remote_code=self.ctx.model_config.trust_remote_code, - )) - return self.ctx.get_hf_processor( - processor_cls, - **kwargs, - ) - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 6e991d99b9638..3c01789b90066 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -88,15 +88,7 @@ ImageInputs = 
Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs] class Idefics3ProcessingInfo(BaseProcessingInfo): - def get_hf_processor( - self, - *, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ) -> Idefics3Processor: - if size is not None: - kwargs["size"] = size - + def get_hf_processor(self, **kwargs: object) -> Idefics3Processor: return self.ctx.get_hf_processor(Idefics3Processor, **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index a0e98ca3f8155..8e766dd4c4768 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -665,14 +665,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo): """Basic image-only ProcessingInfo for InternVL-style models.""" @abstractmethod - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> BaseInternVLProcessor: + def get_hf_processor(self, **kwargs: object) -> BaseInternVLProcessor: raise NotImplementedError def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: @@ -882,27 +875,12 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo): return max(max_frames_per_video, 1) - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> InternVLProcessor: - if min_dynamic_patch is not None: - kwargs["min_dynamic_patch"] = min_dynamic_patch - if max_dynamic_patch is not None: - kwargs["max_dynamic_patch"] = max_dynamic_patch - if dynamic_image_size is not None: - kwargs["dynamic_image_size"] = dynamic_image_size - - kwargs["video_token"] = self.get_video_token() - + def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: return self.ctx.init_processor( 
InternVLProcessor, config=self.get_hf_config(), tokenizer=self.get_tokenizer(), + video_token=self.get_video_token(), **kwargs, ) diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 892d970aaade0..4d8aa8de0f0b1 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -44,8 +44,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope -from vllm.transformers_utils.processor import ( - cached_image_processor_from_config) from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import (MultiModalEmbeddings, SupportsLoRA, @@ -980,72 +978,8 @@ class KeyeMultiModalDataParser(MultiModalDataParser): class KeyeProcessingInfo(BaseProcessingInfo): - def get_hf_processor( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ): - return self.ctx.get_hf_processor( - image_processor=self.get_image_processor( - min_pixels=min_pixels, - max_pixels=max_pixels, - size=size, - ), - **kwargs, - ) - - def _get_image_processor_kwargs( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ): - if self.ctx.model_config.mm_processor_kwargs: - kwargs.update(self.ctx.model_config.mm_processor_kwargs) - - if min_pixels is not None: - kwargs["min_pixels"] = min_pixels - - if size is None: - size = {"shortest_edge": min_pixels} - else: - size["shortest_edge"] = min_pixels - - if max_pixels is not None: - kwargs["max_pixels"] = max_pixels - - if size is None: - size = {"longest_edge": max_pixels} - else: - size["longest_edge"] = max_pixels - - if size is not None: - kwargs["size"] = size - - return kwargs - - def get_image_processor( - self, - *, - min_pixels: Optional[int] 
= None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ): - return cached_image_processor_from_config( - self.ctx.model_config, - **self._get_image_processor_kwargs( - min_pixels=min_pixels, - max_pixels=max_pixels, - size=size, - **kwargs, - ), - ) + def get_image_processor(self, **kwargs: object): + return self.get_hf_processor(**kwargs).image_processor def get_supported_mm_limits(self, ) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} @@ -1246,20 +1180,6 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]): def _get_data_parser(self) -> MultiModalDataParser: return KeyeMultiModalDataParser() - def _call_hf_processor( - self, - prompt: str, - mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object], - tok_kwargs: Mapping[str, object], - ) -> BatchFeature: - mm_kwargs = self.info._get_image_processor_kwargs(**mm_kwargs) - return self.info.ctx.call_hf_processor( - self.info.get_hf_processor(**mm_kwargs), - dict(text=prompt, **mm_data), - dict(**mm_kwargs, **tok_kwargs), - ) - def _get_prompt_updates( self, mm_items: MultiModalDataItems, diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 0126ace09e707..c863ba406422d 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -8,11 +8,9 @@ from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, import torch import torch.nn as nn -from packaging.version import Version from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig, PixtralVisionConfig, PretrainedConfig, SiglipVisionConfig) -from transformers import __version__ as TRANSFORMERS_VERSION from transformers.models.llava import LlavaProcessor from transformers.models.pixtral import PixtralProcessor @@ -307,29 +305,14 @@ class PixtralHFMultiModalProcessor( pixel_values = processed_outputs.get("pixel_values") if pixel_values is not None: - # Before/after 
https://github.com/huggingface/transformers/pull/35122 - if Version(TRANSFORMERS_VERSION) <= Version("4.48.3"): - images = mm_data["images"] - assert isinstance(images, list) + # Avoid padding since we need the output for each image to be + # independent of other images for the cache to work correctly + image_sizes = processed_outputs["image_sizes"] + assert len(pixel_values) == len(image_sizes) - # Original output: (1, num_images, C, H, W) - # New output: (num_images, C, H, W) - assert (isinstance(pixel_values, list) - and len(pixel_values) == 1) - assert (isinstance(pixel_values[0], list) - and len(pixel_values[0]) == len(images)) - - processed_outputs["pixel_values"] = pixel_values[0] - else: - # Avoid padding since we need the output for each image to be - # independent of other images for the cache to work correctly - image_sizes = processed_outputs["image_sizes"] - assert len(pixel_values) == len(image_sizes) - - processed_outputs["pixel_values"] = [ - p[:, :h, :w] - for p, (h, w) in zip(pixel_values, image_sizes) - ] + processed_outputs["pixel_values"] = [ + p[:, :h, :w] for p, (h, w) in zip(pixel_values, image_sizes) + ] return processed_outputs @@ -784,17 +767,10 @@ class MantisProcessingInfo(LlavaProcessingInfo): vision_info = self.get_vision_encoder_info() kwargs.setdefault("patch_size", vision_info.get_patch_size()) - - if Version(TRANSFORMERS_VERSION) < Version("4.48"): - # BUG: num_additional_image_tokens = 0 but treated as 1, - # so we set vision_feature_select_strategy to None to offset this - kwargs.setdefault("vision_feature_select_strategy", None) - else: - # FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150 - kwargs.setdefault( - "vision_feature_select_strategy", - hf_config.vision_feature_select_strategy, - ) + kwargs.setdefault( + "vision_feature_select_strategy", + hf_config.vision_feature_select_strategy, + ) return 
self.ctx.get_hf_processor(LlavaProcessor, **kwargs) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 70f2d4a6420b9..e172758b2f2c5 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -331,10 +331,8 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo): return hf_processor - def get_image_processor(self): - hf_processor = self.get_hf_processor() - image_processor = hf_processor.image_processor # type: ignore - return image_processor + def get_image_processor(self, **kwargs: object): + return self.get_hf_processor(**kwargs).image_processor def get_model_version(self): return get_version_by_config(self.get_hf_config()) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index dea85d320adfd..924f10d82b381 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -533,7 +533,7 @@ class Mllama4ProcessingInfo(BaseProcessingInfo): def get_hf_processor(self, **kwargs: object) -> Llama4Processor: return self.ctx.get_hf_processor(Llama4Processor, - use_fast=True, + use_fast=kwargs.pop("use_fast", True), **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index 5d0513d707413..b90cb9b39a60b 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -137,34 +137,16 @@ class NemotronVLProcessor(InternVLProcessor): class NemotronVLProcessingInfo(BaseInternVLProcessingInfo): """Processing info for Nemotron VL models.""" - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> NemotronVLProcessor: - if min_dynamic_patch is not None: - kwargs["min_dynamic_patch"] = min_dynamic_patch - if max_dynamic_patch is 
not None: - kwargs["max_dynamic_patch"] = max_dynamic_patch - if dynamic_image_size is not None: - kwargs["dynamic_image_size"] = dynamic_image_size - - image_processor = self.get_image_processor() + def get_hf_processor(self, **kwargs: object) -> NemotronVLProcessor: return self.ctx.init_processor( NemotronVLProcessor, config=self.get_hf_config(), tokenizer=self.get_tokenizer(), - image_processor=image_processor, + image_processor=self.get_image_processor(), **kwargs, ) - def get_image_processor( - self, - **kwargs: object, - ): + def get_image_processor(self, **kwargs: object): return cached_image_processor_from_config( self.ctx.model_config, **kwargs, diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index 2f7f8e437f0ad..4bea1392a6814 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -63,21 +63,7 @@ class NVLMProcessor(BaseInternVLProcessor): class NVLMProcessingInfo(BaseInternVLProcessingInfo): - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> NVLMProcessor: - if min_dynamic_patch is not None: - kwargs["min_dynamic_patch"] = min_dynamic_patch - if max_dynamic_patch is not None: - kwargs["max_dynamic_patch"] = max_dynamic_patch - if dynamic_image_size is not None: - kwargs["dynamic_image_size"] = dynamic_image_size - + def get_hf_processor(self, **kwargs: object) -> NVLMProcessor: return self.ctx.init_processor( NVLMProcessor, config=self.get_hf_config(), diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index c8b528048b557..6b27980e0b0c3 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -25,7 +25,7 @@ import torch import torch.nn as nn from torch import Tensor from torch.nn.functional import gumbel_softmax, pad, softmax -from transformers import 
BaseImageProcessor, BatchFeature, PretrainedConfig +from transformers import BatchFeature, PretrainedConfig from vllm.config import VllmConfig from vllm.model_executor.layers.linear import ReplicatedLinear @@ -245,11 +245,12 @@ class VisualEmbedding(torch.nn.Embedding): class OvisProcessingInfo(BaseProcessingInfo): - def get_hf_processor(self, **kwargs): + def get_hf_processor(self, **kwargs: object): return self.ctx.get_hf_processor( OvisProcessor, image_pad_token=self.get_image_pad_token(), image_segment_len=self.get_image_segment_len(), + **kwargs, ) def get_image_segment_len(self) -> int: @@ -269,9 +270,6 @@ class OvisProcessingInfo(BaseProcessingInfo): text_model_type = hf_text_config.model_type return IMAGE_PAD_TOKEN_MAP.get(text_model_type) - def get_image_processor(self) -> BaseImageProcessor: - return self.get_hf_processor().image_processor # type: ignore - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index aa739f22fd7bf..9ef4f8371eb3d 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -318,17 +318,6 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase): class Phi3VProcessingInfo(BaseProcessingInfo): - def get_hf_processor( - self, - *, - num_crops: Optional[int] = None, - **kwargs: object, - ) -> ProcessorMixin: - if num_crops is not None: - kwargs["num_crops"] = num_crops - - return self.ctx.get_hf_processor(**kwargs) - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/phi4_multimodal.py b/vllm/model_executor/models/phi4_multimodal.py index 432b707a61591..e13b8276bf17a 100644 --- a/vllm/model_executor/models/phi4_multimodal.py +++ b/vllm/model_executor/models/phi4_multimodal.py @@ -696,19 +696,12 @@ class Phi4MMProcessingInfo(BaseProcessingInfo): def get_hf_config(self) -> Phi4MultimodalConfig: return 
self.ctx.get_hf_config(Phi4MultimodalConfig) - def get_hf_processor( - self, - *, - dynamic_hd: Optional[int] = None, - **kwargs: object, - ) -> Phi4MMProcessor: - if dynamic_hd is not None: - kwargs["dynamic_hd"] = dynamic_hd + def get_hf_processor(self, **kwargs: object) -> Phi4MMProcessor: + return self.ctx.get_hf_processor(Phi4MMProcessor, **kwargs) - return self.ctx.get_hf_processor(**kwargs) - - def get_feature_extractor(self) -> Phi4MultimodalFeatureExtractor: - return self.get_hf_processor().audio_processor + def get_feature_extractor( + self, **kwargs: object) -> Phi4MultimodalFeatureExtractor: + return self.get_hf_processor(**kwargs).audio_processor def get_image_processor( self, @@ -1007,7 +1000,7 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): if audio_data: audio_features = processed_outputs['audio_input_features'] - sr = self.info.get_feature_extractor().sampling_rate + sr = self.info.get_feature_extractor(**mm_kwargs).sampling_rate feature_sizes = [ self.info.get_audio_num_frames(len(audio), sr) for audio in audio_data @@ -1043,7 +1036,8 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): audio_token_id = tokenizer.vocab[tokenizer.audio_token] hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - audio_processor = self.info.get_feature_extractor() + audio_processor = self.info.get_feature_extractor( + **hf_processor_mm_kwargs) def get_image_replacement_phi4mm(item_idx: int): images = mm_items.get_items( diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 9b61c3634d841..73e8446e6dea7 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -459,17 +459,6 @@ def cat_with_pad(tensors, dim, padding_value=0): class Phi4MMProcessingInfo(BaseProcessingInfo): - def get_hf_processor( - self, - *, - dynamic_hd: Optional[int] = None, - **kwargs: object, - ) -> ProcessorMixin: - if dynamic_hd is not 
None: - kwargs["dynamic_hd"] = dynamic_hd - - return self.ctx.get_hf_processor(**kwargs) - @property def image_tokens(self) -> list[str]: return [f"<|image_{i+1}|>" for i in range(100)] @@ -487,8 +476,9 @@ class Phi4MMProcessingInfo(BaseProcessingInfo): image_processor = processor.image_processor return image_processor.dynamic_hd - def get_feature_extractor(self) -> SequenceFeatureExtractor: - return self.get_hf_processor().audio_processor + def get_feature_extractor(self, + **kwargs: object) -> SequenceFeatureExtractor: + return self.get_hf_processor(**kwargs).audio_processor def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": None, "image": None} @@ -769,7 +759,7 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") - sr = self.info.get_feature_extractor().sampling_rate + sr = self.info.get_feature_extractor(**mm_kwargs).sampling_rate if (audio_data := mm_data.get("audios", [])): mm_data['audios'] = [(data, sr) for data in audio_data] @@ -816,7 +806,8 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): ) -> Sequence[PromptUpdate]: image_tokens: list[str] = self.info.image_tokens # type: ignore audio_tokens: list[str] = self.info.audio_tokens # type: ignore - feature_extractor = self.info.get_feature_extractor() + feature_extractor = self.info.get_feature_extractor( + **hf_processor_mm_kwargs) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) def get_image_replacement_phi4mm(item_idx: int): diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index c5a5c10d9509f..b9fed79c84cdd 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -132,50 +132,15 @@ class 
Qwen2_5OmniThinkerProcessingInfo(Qwen2AudioProcessingInfo, def get_hf_config(self): return self.ctx.get_hf_config(Qwen2_5OmniConfig).thinker_config - def get_hf_processor( - self, - *, - sampling_rate: Optional[int] = None, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - fps: Optional[Union[float, list[float]]] = None, - **kwargs: object, - ) -> Qwen2_5OmniProcessor: - if fps is not None: - kwargs["fps"] = fps - - # Monkey patch for Transformers v4.53 - processor_class = Qwen2_5OmniProcessor - if processor_class.image_processor_class != "AutoImageProcessor": - processor_class.image_processor_class = "AutoImageProcessor" - if processor_class.video_processor_class != "AutoVideoProcessor": - processor_class.video_processor_class = "AutoVideoProcessor" - - processor = self.ctx.get_hf_processor( - processor_class, - image_processor=self.get_image_processor(min_pixels=min_pixels, - max_pixels=max_pixels, - size=size, - use_fast=kwargs.get( - "use_fast", True)), + def get_hf_processor(self, **kwargs: object) -> Qwen2_5OmniProcessor: + return self.ctx.get_hf_processor( + Qwen2_5OmniProcessor, + use_fast=kwargs.pop("use_fast", True), **kwargs, ) - if not hasattr(processor, "audio_token"): - processor.audio_token = "<|AUDIO|>" - if not hasattr(processor, "image_token"): - processor.image_token = "<|IMAGE|>" - if not hasattr(processor, "video_token"): - processor.video_token = "<|VIDEO|>" - return processor - def get_feature_extractor( - self, - *, - sampling_rate: Optional[int] = None, - **kwargs: object, - ): - hf_processor = self.get_hf_processor(sampling_rate=sampling_rate) + def get_feature_extractor(self, **kwargs: object): + hf_processor = self.get_hf_processor(**kwargs) feature_extractor = hf_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor diff --git a/vllm/model_executor/models/qwen2_5_vl.py 
b/vllm/model_executor/models/qwen2_5_vl.py index 8ae096536fdc5..c4c4650f569e1 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -780,25 +780,10 @@ class Qwen2_5_VLProcessingInfo(Qwen2VLProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(Qwen2_5_VLConfig) - def get_hf_processor( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - fps: Optional[Union[float, list[float]]] = None, - **kwargs: object, - ) -> Qwen2_5_VLProcessor: - if fps is not None: - kwargs["fps"] = fps - + def get_hf_processor(self, **kwargs: object) -> Qwen2_5_VLProcessor: return self.ctx.get_hf_processor( Qwen2_5_VLProcessor, - image_processor=self.get_image_processor(min_pixels=min_pixels, - max_pixels=max_pixels, - size=size, - use_fast=kwargs.get( - "use_fast", True)), + use_fast=kwargs.pop("use_fast", True), **kwargs, ) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index d7fec30acd8d3..3ef55cd704cf0 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -86,22 +86,12 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(Qwen2AudioConfig) - def get_hf_processor( - self, - *, - # Ignored in initialization - sampling_rate: Optional[int] = None, - **kwargs: object, - ) -> Qwen2AudioProcessor: + def get_hf_processor(self, **kwargs: object) -> Qwen2AudioProcessor: return self.ctx.get_hf_processor(Qwen2AudioProcessor, **kwargs) - def get_feature_extractor( - self, - *, - # Ignored in initialization - sampling_rate: Optional[int] = None, - ) -> WhisperFeatureExtractor: - hf_processor = self.get_hf_processor(sampling_rate=sampling_rate) + def get_feature_extractor(self, + **kwargs: object) -> WhisperFeatureExtractor: + hf_processor = self.get_hf_processor(**kwargs) feature_extractor = 
hf_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index ad63bb4af4e9d..4e8ea8e449133 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -69,8 +69,6 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import _Backend, current_platform from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope -from vllm.transformers_utils.processor import ( - cached_image_processor_from_config) from vllm.transformers_utils.tokenizer import AnyTokenizer from .interfaces import (MultiModalEmbeddings, SupportsLoRA, @@ -752,73 +750,15 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(Qwen2VLConfig) - def get_hf_processor( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ) -> Qwen2VLProcessor: + def get_hf_processor(self, **kwargs: object) -> Qwen2VLProcessor: return self.ctx.get_hf_processor( Qwen2VLProcessor, - image_processor=self.get_image_processor(min_pixels=min_pixels, - max_pixels=max_pixels, - size=size, - use_fast=kwargs.get( - "use_fast", True)), + use_fast=kwargs.pop("use_fast", True), **kwargs, ) - def _get_image_processor_kwargs( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ): - mm_config = self.ctx.model_config.get_multimodal_config() - if mm_config.mm_processor_kwargs: - kwargs.update(mm_config.mm_processor_kwargs) - - if min_pixels is not None: - kwargs["min_pixels"] = min_pixels - - if size is None: - size = {"shortest_edge": min_pixels} - else: - size["shortest_edge"] = min_pixels - - if max_pixels is not None: - 
kwargs["max_pixels"] = max_pixels - - if size is None: - size = {"longest_edge": max_pixels} - else: - size["longest_edge"] = max_pixels - - if size is not None: - kwargs["size"] = size - - return kwargs - - def get_image_processor( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - size: Optional[dict[str, int]] = None, - **kwargs: object, - ) -> Qwen2VLImageProcessor: - kwargs["use_fast"] = kwargs.get("use_fast", True) - return cached_image_processor_from_config( - self.ctx.model_config, - **self._get_image_processor_kwargs(min_pixels=min_pixels, - max_pixels=max_pixels, - size=size, - **kwargs), - ) + def get_image_processor(self, **kwargs: object) -> Qwen2VLImageProcessor: + return self.get_hf_processor(**kwargs).image_processor def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} @@ -1023,20 +963,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] def _get_data_parser(self) -> MultiModalDataParser: return Qwen2VLMultiModalDataParser() - def _call_hf_processor( - self, - prompt: str, - mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object], - tok_kwargs: Mapping[str, object], - ) -> BatchFeature: - mm_kwargs = self.info._get_image_processor_kwargs(**mm_kwargs) - return self.info.ctx.call_hf_processor( - self.info.get_hf_processor(**mm_kwargs), - dict(text=prompt, **mm_data), - dict(**mm_kwargs, **tok_kwargs), - ) - def _get_prompt_updates( self, mm_items: MultiModalDataItems, diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index 5ae5c0bc1d5dc..c76aabcd27ccb 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -7,9 +7,8 @@ # Copyright (c) 2025 Skywork # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- -from abc import ABC, abstractmethod from collections.abc import 
Iterable, Mapping, Sequence -from typing import Literal, Optional, TypedDict, TypeVar, Union +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -232,7 +231,7 @@ def image_to_pixel_values_skyworkr1v( return pixel_values -class BaseSkyworkR1VProcessor(ABC): +class SkyworkR1VProcessor: """ This model doesn't define its own HF processor, so we implement our own one here. @@ -279,17 +278,18 @@ class BaseSkyworkR1VProcessor(ABC): self.use_thumbnail: bool = config.use_thumbnail @property - @abstractmethod def image_token_id(self) -> int: - raise NotImplementedError + return self.tokenizer.get_vocab()[IMG_CONTEXT] - @abstractmethod def get_image_repl( self, feature_size: int, num_patches: Optional[int], ) -> PromptUpdateDetails[str]: - raise NotImplementedError + repl_features = IMG_CONTEXT * feature_size + repl_full = IMG_START + repl_features + IMG_END + + return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT) def resolve_min_max_num( self, @@ -426,35 +426,15 @@ class BaseSkyworkR1VProcessor(ABC): } -class SkyworkR1VProcessor(BaseSkyworkR1VProcessor): +class SkyworkR1VProcessingInfo(BaseProcessingInfo): - @property - def image_token_id(self) -> int: - return self.tokenizer.get_vocab()[IMG_CONTEXT] - - def get_image_repl( - self, - feature_size: int, - num_patches: Optional[int], - ) -> PromptUpdateDetails[str]: - repl_features = IMG_CONTEXT * feature_size - repl_full = IMG_START + repl_features + IMG_END - - return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT) - - -class BaseSkyworkR1VProcessingInfo(BaseProcessingInfo): - - @abstractmethod - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> BaseSkyworkR1VProcessor: - raise NotImplementedError + def get_hf_processor(self, **kwargs: object) -> SkyworkR1VProcessor: + return self.ctx.init_processor( + SkyworkR1VProcessor, + 
config=self.get_hf_config(), + tokenizer=self.get_tokenizer(), + **kwargs, + ) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} @@ -464,7 +444,7 @@ class BaseSkyworkR1VProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - processor: Optional[BaseSkyworkR1VProcessor], + processor: Optional[SkyworkR1VProcessor], ) -> int: if processor is None: processor = self.get_hf_processor() @@ -500,10 +480,8 @@ class BaseSkyworkR1VProcessingInfo(BaseProcessingInfo): return largest_feature_pinpoint -_I = TypeVar("_I", bound=BaseSkyworkR1VProcessingInfo) - - -class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[_I]): +class SkyworkR1VDummyInputsBuilder( + BaseDummyInputsBuilder[SkyworkR1VProcessingInfo]): def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: num_images = mm_counts.get("image", 0) @@ -527,7 +505,8 @@ class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[_I]): } -class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[_I]): +class SkyworkR1VMultiModalProcessor( + BaseMultiModalProcessor[SkyworkR1VProcessingInfo]): def _call_hf_processor( self, @@ -617,31 +596,6 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[_I]): ] -class SkyworkR1VProcessingInfo(BaseSkyworkR1VProcessingInfo): - - def get_hf_processor( - self, - *, - min_dynamic_patch: Optional[int] = None, - max_dynamic_patch: Optional[int] = None, - dynamic_image_size: Optional[bool] = None, - **kwargs: object, - ) -> SkyworkR1VProcessor: - if min_dynamic_patch is not None: - kwargs["min_dynamic_patch"] = min_dynamic_patch - if max_dynamic_patch is not None: - kwargs["max_dynamic_patch"] = max_dynamic_patch - if dynamic_image_size is not None: - kwargs["dynamic_image_size"] = dynamic_image_size - - return self.ctx.init_processor( - SkyworkR1VProcessor, - config=self.get_hf_config(), - tokenizer=self.get_tokenizer(), - **kwargs, - ) - - @MULTIMODAL_REGISTRY.register_processor( SkyworkR1VMultiModalProcessor, 
info=SkyworkR1VProcessingInfo, diff --git a/vllm/model_executor/models/smolvlm.py b/vllm/model_executor/models/smolvlm.py index 0f22ba5b406ce..2adfad67152b3 100644 --- a/vllm/model_executor/models/smolvlm.py +++ b/vllm/model_executor/models/smolvlm.py @@ -19,15 +19,7 @@ from .idefics3 import Idefics3ProcessingInfo class SmolVLMProcessingInfo(Idefics3ProcessingInfo): - def get_hf_processor( - self, - *, - max_image_size: Optional[dict[str, int]] = None, - **kwargs: object, - ) -> SmolVLMProcessor: - if max_image_size is not None: - kwargs["max_image_size"] = max_image_size - + def get_hf_processor(self, **kwargs: object) -> SmolVLMProcessor: return self.ctx.get_hf_processor(SmolVLMProcessor, **kwargs) def _get_image_token( diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py index 979d789b330cf..70cf5e95a54e1 100644 --- a/vllm/model_executor/models/tarsier.py +++ b/vllm/model_executor/models/tarsier.py @@ -178,13 +178,11 @@ class TarsierProcessingInfo(BaseProcessingInfo): return get_vision_encoder_info(self.get_hf_config()) def get_hf_processor(self, **kwargs: object) -> TarsierProcessor: - hf_processor = self.ctx.get_hf_processor(TarsierProcessor, **kwargs) - # Patch for patch_size if needed (copied from vLLM LLaVA) - if hasattr(hf_processor, - 'patch_size') and hf_processor.patch_size is None: - patch_size = self.get_vision_encoder_info().get_patch_size() - hf_processor.patch_size = patch_size - return hf_processor + vision_info = self.get_vision_encoder_info() + + kwargs.setdefault("patch_size", vision_info.get_patch_size()) + + return self.ctx.get_hf_processor(TarsierProcessor, **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 8cd95605cdfae..e67548800c354 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -48,7 
+48,6 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.processor import cached_get_processor from vllm.utils import is_list_of from .interfaces import (SupportsLoRA, SupportsMultiModal, SupportsPP, @@ -189,10 +188,6 @@ class MultiModalProcessingInfo(BaseProcessingInfo): image_tokens = mm_tokens["num_image_tokens"][0] return image_tokens - def get_hf_processor(self): - processor = cached_get_processor(self.ctx.model_config.model) - return processor - def get_max_image_size(self): return 10_000, 10_000 # hardcode for arbitrary very large size diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index a4569ccd5a845..bef34c1be49fe 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -71,13 +71,7 @@ UltravoxAudioInputs = Union[UltravoxAudioFeatureInputs, class UltravoxProcessingInfo(BaseProcessingInfo): - def get_hf_processor( - self, - *, - # Ignored in initialization - sampling_rate: Optional[int] = None, - **kwargs: object, - ) -> ProcessorMixin: + def get_hf_processor(self, **kwargs: object) -> ProcessorMixin: config = self.ctx.model_config.hf_config hf_processor = self.ctx.get_hf_processor(**kwargs) @@ -89,13 +83,9 @@ class UltravoxProcessingInfo(BaseProcessingInfo): return hf_processor - def get_feature_extractor( - self, - *, - # Ignored in initialization - sampling_rate: Optional[int] = None, - ) -> WhisperFeatureExtractor: - hf_processor = self.get_hf_processor(sampling_rate=sampling_rate) + def get_feature_extractor(self, + **kwargs: object) -> WhisperFeatureExtractor: + hf_processor = self.get_hf_processor(**kwargs) audio_processor = hf_processor.audio_processor # type: ignore feature_extractor = audio_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, 
WhisperFeatureExtractor) @@ -156,7 +146,7 @@ class UltravoxMultiModalProcessor( audios = mm_data.pop("audios", []) assert isinstance(audios, list) - feature_extractor = self.info.get_feature_extractor() + feature_extractor = self.info.get_feature_extractor(**mm_kwargs) mm_kwargs = dict( **mm_kwargs, sampling_rate=feature_extractor.sampling_rate, diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index d7bafb9ef84d9..ca02ecd828ba3 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -623,23 +623,22 @@ class WhisperProcessingInfo(BaseProcessingInfo): def get_hf_config(self) -> WhisperConfig: return self.ctx.get_hf_config(WhisperConfig) - def get_hf_processor(self, - sampling_rate: Optional[int] = None - ) -> WhisperProcessor: - # HACK: Transformers 4.53.0 has issue with whisper tokenizer to + def get_hf_processor(self, **kwargs: object) -> WhisperProcessor: + # HACK: Transformers 4.53.2 has issue with whisper tokenizer to # initialize processor. We use a monkeypatch to fix it here. 
# See: https://github.com/vllm-project/vllm/issues/20224 processor_class = WhisperProcessor tokenizer_class = ("WhisperTokenizer", "WhisperTokenizerFast") if processor_class.tokenizer_class != tokenizer_class: processor_class.tokenizer_class = tokenizer_class - return self.ctx.get_hf_processor(processor_class) + return self.ctx.get_hf_processor(processor_class, **kwargs) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": 1} - def get_feature_extractor(self) -> WhisperFeatureExtractor: - hf_processor = self.get_hf_processor() + def get_feature_extractor(self, + **kwargs: object) -> WhisperFeatureExtractor: + hf_processor = self.get_hf_processor(**kwargs) feature_extractor = hf_processor.feature_extractor # type: ignore assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor @@ -702,7 +701,7 @@ class WhisperMultiModalProcessor( tok_kwargs: Mapping[str, object], ) -> BatchFeature: if mm_data: - feature_extractor = self.info.get_feature_extractor() + feature_extractor = self.info.get_feature_extractor(**mm_kwargs) mm_data = dict(audio=mm_data.pop("audios")) mm_kwargs = dict( **mm_kwargs, diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index 70cd08263d372..a630d940b2578 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -4,9 +4,15 @@ from functools import lru_cache from typing import TYPE_CHECKING, Any, Optional, Union, cast +from transformers import (AutoFeatureExtractor, AutoImageProcessor, + AutoProcessor) +from transformers.feature_extraction_utils import FeatureExtractionMixin +from transformers.image_processing_utils import BaseImageProcessor from transformers.processing_utils import ProcessorMixin from typing_extensions import TypeVar +from vllm.utils import get_allowed_kwarg_only_overrides + if TYPE_CHECKING: from vllm.config import ModelConfig @@ -33,23 +39,42 @@ class HashableList(list): return hash(tuple(self)) 
-def _merge_mm_kwargs(model_config: "ModelConfig", **kwargs): - mm_config = model_config.get_multimodal_config() - base_kwargs = mm_config.mm_processor_kwargs - if base_kwargs is None: - base_kwargs = {} +def _get_processor_factory_fn(processor_cls: Union[type, tuple[type, ...]]): + if isinstance(processor_cls, tuple) or processor_cls == ProcessorMixin: + return AutoProcessor.from_pretrained + if hasattr(processor_cls, "from_pretrained"): + return processor_cls.from_pretrained - merged_kwargs = {**base_kwargs, **kwargs} + return processor_cls + + +def _merge_mm_kwargs( + model_config: "ModelConfig", + processor_cls: Union[type, tuple[type, ...]], + /, + **kwargs, +): + mm_config = model_config.get_multimodal_config() + merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs) + + factory = _get_processor_factory_fn(processor_cls) + allowed_kwargs = get_allowed_kwarg_only_overrides( + factory, + merged_kwargs, + requires_kw_only=False, + allow_var_kwargs=True, + ) # NOTE: Pythonic dict is not hashable and will raise unhashable type # error when calling `cached_get_processor`, therefore we need to # wrap it to a hashable dict. 
- for key, value in merged_kwargs.items(): + for key, value in allowed_kwargs.items(): if isinstance(value, dict): - merged_kwargs[key] = HashableDict(value) + allowed_kwargs[key] = HashableDict(value) if isinstance(value, list): - merged_kwargs[key] = HashableList(value) - return merged_kwargs + allowed_kwargs[key] = HashableList(value) + + return allowed_kwargs def get_processor( @@ -61,21 +86,29 @@ def get_processor( **kwargs: Any, ) -> _P: """Load a processor for the given model name via HuggingFace.""" - # don't put this import at the top level - # it will call torch.cuda.device_count() - from transformers import AutoProcessor - - processor_factory = (AutoProcessor if processor_cls == ProcessorMixin or - isinstance(processor_cls, tuple) else processor_cls) + if revision is None: + revision = "main" try: - processor = processor_factory.from_pretrained( - processor_name, - *args, - revision=revision, - trust_remote_code=trust_remote_code, - **kwargs, - ) + if isinstance(processor_cls, tuple) or processor_cls == ProcessorMixin: + processor = AutoProcessor.from_pretrained( + processor_name, + *args, + revision=revision, + trust_remote_code=trust_remote_code, + **kwargs, + ) + elif issubclass(processor_cls, ProcessorMixin): + processor = processor_cls.from_pretrained( + processor_name, + *args, + revision=revision, + trust_remote_code=trust_remote_code, + **kwargs, + ) + else: + # Processors that are standalone classes unrelated to HF + processor = processor_cls(*args, **kwargs) except ValueError as e: # If the error pertains to the processor class not existing or not # currently being imported, suggest using the --trust-remote-code flag. 
@@ -112,7 +145,7 @@ def cached_processor_from_config( revision=model_config.revision, trust_remote_code=model_config.trust_remote_code, processor_cls=processor_cls, # type: ignore[arg-type] - **_merge_mm_kwargs(model_config, **kwargs), + **_merge_mm_kwargs(model_config, processor_cls, **kwargs), ) @@ -125,10 +158,6 @@ def get_feature_extractor( ): """Load an audio feature extractor for the given model name via HuggingFace.""" - # don't put this import at the top level - # it will call torch.cuda.device_count() - from transformers import AutoFeatureExtractor - from transformers.feature_extraction_utils import FeatureExtractionMixin try: feature_extractor = AutoFeatureExtractor.from_pretrained( processor_name, @@ -164,7 +193,7 @@ def cached_feature_extractor_from_config( model_config.model, revision=model_config.revision, trust_remote_code=model_config.trust_remote_code, - **_merge_mm_kwargs(model_config, **kwargs), + **_merge_mm_kwargs(model_config, AutoFeatureExtractor, **kwargs), ) @@ -176,11 +205,6 @@ def get_image_processor( **kwargs: Any, ): """Load an image processor for the given model name via HuggingFace.""" - # don't put this import at the top level - # it will call torch.cuda.device_count() - from transformers import AutoImageProcessor - from transformers.image_processing_utils import BaseImageProcessor - try: processor = AutoImageProcessor.from_pretrained( processor_name, @@ -217,5 +241,5 @@ def cached_image_processor_from_config( model_config.model, revision=model_config.revision, trust_remote_code=model_config.trust_remote_code, - **_merge_mm_kwargs(model_config, **kwargs), + **_merge_mm_kwargs(model_config, AutoImageProcessor, **kwargs), ) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index ae978c855a8e5..a7f579b0c9c2d 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2010,49 +2010,6 @@ def supports_kw( return False -def resolve_mm_processor_kwargs( - init_kwargs: Optional[Mapping[str, object]], - inference_kwargs: 
Optional[Mapping[str, object]], - callable: Callable[..., object], - *, - requires_kw_only: bool = True, - allow_var_kwargs: bool = False, -) -> dict[str, Any]: - """Applies filtering to eliminate invalid mm_processor_kwargs, i.e., - those who are not explicit keywords to the given callable (of one is - given; otherwise no filtering is done), then merges the kwarg dicts, - giving priority to inference_kwargs if there are any collisions. - - In the case that no kwarg overrides are provided, returns an empty - dict so that it can still be kwarg expanded into the callable later on. - - If allow_var_kwargs=True, allows for things that can be expanded into - kwargs as long as they aren't naming collision for var_kwargs or potential - positional arguments. - """ - # Filter inference time multimodal processor kwargs provided - runtime_mm_kwargs = get_allowed_kwarg_only_overrides( - callable, - overrides=inference_kwargs, - requires_kw_only=requires_kw_only, - allow_var_kwargs=allow_var_kwargs, - ) - - # Filter init time multimodal processor kwargs provided - init_mm_kwargs = get_allowed_kwarg_only_overrides( - callable, - overrides=init_kwargs, - requires_kw_only=requires_kw_only, - allow_var_kwargs=allow_var_kwargs, - ) - - # Merge the final processor kwargs, prioritizing inference - # time values over the initialization time values. 
- mm_processor_kwargs = {**init_mm_kwargs, **runtime_mm_kwargs} - - return mm_processor_kwargs - - def get_allowed_kwarg_only_overrides( callable: Callable[..., object], overrides: Optional[Mapping[str, object]], From e1a7fe4af5e9c287501c648e64956a08705af86a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Seznec?= Date: Fri, 1 Aug 2025 07:45:02 +0200 Subject: [PATCH 07/54] [BugFix] fix: aot passes kvcache dtype information (#19750) Signed-off-by: Mickael Seznec --- vllm/v1/attention/backends/flash_attn.py | 25 ++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 4c2a6c6b985b2..3f9afa67aef70 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -99,6 +99,13 @@ class FlashAttentionBackend(AttentionBackend): raise ValueError(f"Unknown cache layout format {cache_layout}.") return stride_order + @staticmethod + def get_fp8_dtype_for_flashattn(kv_cache_dtype: str) -> torch.dtype: + if kv_cache_dtype in ("fp8", "fp8_e4m3"): + return torch.float8_e4m3fn + else: + raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}") + @dataclass class FlashAttentionMetadata: @@ -161,6 +168,7 @@ class FlashAttentionMetadataBuilder( self.parallel_config) self.num_heads_kv = self.model_config.get_num_kv_heads( self.parallel_config) + self.kv_cache_dtype = kv_cache_spec.dtype self.headdim = self.model_config.get_head_size() self.block_size = kv_cache_spec.block_size @@ -239,17 +247,24 @@ class FlashAttentionMetadataBuilder( def schedule(batch_size, cu_query_lens, max_query_len, seqlens, max_seq_len, causal): + cache_dtype = self.cache_config.cache_dtype + if cache_dtype.startswith("fp8"): + qkv_dtype = FlashAttentionBackend.get_fp8_dtype_for_flashattn( + cache_dtype) + else: + qkv_dtype = self.kv_cache_dtype if aot_schedule: return get_scheduler_metadata( batch_size=batch_size, max_seqlen_q=max_query_len, 
max_seqlen_k=max_seq_len, - cache_seqlens=seqlens, num_heads_q=self.num_heads_q, num_heads_kv=self.num_heads_kv, headdim=self.headdim, - page_size=self.block_size, + cache_seqlens=seqlens, + qkv_dtype=qkv_dtype, cu_seqlens_q=cu_query_lens, + page_size=self.block_size, causal=causal, window_size=self.aot_sliding_window, num_splits=self.max_num_splits, @@ -474,8 +489,10 @@ class FlashAttentionImpl(AttentionImpl): ) if self.kv_cache_dtype.startswith("fp8"): - key_cache = key_cache.view(torch.float8_e4m3fn) - value_cache = value_cache.view(torch.float8_e4m3fn) + dtype = FlashAttentionBackend.get_fp8_dtype_for_flashattn( + self.kv_cache_dtype) + key_cache = key_cache.view(dtype) + value_cache = value_cache.view(dtype) num_tokens, num_heads, head_size = query.shape query, _ = ops.scaled_fp8_quant( query.reshape( From 0f46a780d4f53b8564a37370f9f068cdf4e69604 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Fri, 1 Aug 2025 01:45:15 -0400 Subject: [PATCH 08/54] [Model] [Quantization] Support quantization for Gemma3n (#21974) Signed-off-by: Kyle Sayers --- vllm/model_executor/models/gemma3n.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index a58b32793dbef..e16c03c8d3b57 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -46,6 +46,7 @@ from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from .interfaces import SupportsQuant from .utils import (AutoWeightsLoader, extract_layer_index, is_pp_missing_parameter, make_layers, maybe_prefix) @@ -68,6 +69,7 @@ class Gemma3nAltUp(nn.Module): altup_num_inputs: int, altup_coef_clip: float, altup_active_idx: int, + quant_config: QuantizationConfig, prefix: str, ): super().__init__() @@ -80,6 +82,7 @@ class Gemma3nAltUp(nn.Module): altup_num_inputs, 
altup_num_inputs, bias=False, + quant_config=quant_config, prefix=f"{prefix}.correction_coefs", return_bias=False, ) @@ -87,6 +90,7 @@ class Gemma3nAltUp(nn.Module): altup_num_inputs, altup_num_inputs**2, bias=False, + quant_config=quant_config, prefix=f"{prefix}.prediction_coefs", return_bias=False, ) @@ -94,6 +98,7 @@ class Gemma3nAltUp(nn.Module): hidden_size, altup_num_inputs, bias=False, + quant_config=quant_config, prefix=f"{prefix}.modality_router", return_bias=False, ) @@ -400,6 +405,7 @@ class Gemma3nDecoderLayer(nn.Module): altup_num_inputs=config.altup_num_inputs, altup_coef_clip=config.altup_coef_clip, altup_active_idx=config.altup_active_idx, + quant_config=quant_config, prefix=f"{prefix}.altup", ) self.self_attn = Gemma3nAttention( @@ -527,7 +533,7 @@ class Gemma3nDecoderLayer(nn.Module): @support_torch_compile -class Gemma3nTextModel(nn.Module): +class Gemma3nTextModel(nn.Module, SupportsQuant): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -540,6 +546,7 @@ class Gemma3nTextModel(nn.Module): self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, + quant_config=quant_config, prefix=f"{prefix}.embed_tokens", ) self.embed_scale = torch.tensor( @@ -549,6 +556,7 @@ class Gemma3nTextModel(nn.Module): self.embed_tokens_per_layer = VocabParallelEmbedding( config.vocab_size_per_layer_input, config.num_hidden_layers * config.hidden_size_per_layer_input, + quant_config=quant_config, prefix=f"{prefix}.per_layer_embed_tokens", ) self.embed_scale_per_layer = torch.tensor( @@ -582,7 +590,7 @@ class Gemma3nTextModel(nn.Module): gather_output=True, return_bias=False, quant_config=quant_config, - prefix=f"{prefix}.{idx-1}.altup_projections", + prefix=f"{prefix}.altup_projections.{idx-1}", ) for idx in range(1, self.config.altup_num_inputs) ]) self.altup_unembed_projections = nn.ModuleList([ @@ -593,7 +601,7 @@ class Gemma3nTextModel(nn.Module): gather_output=True, return_bias=False, 
quant_config=quant_config, - prefix=f"{prefix}.{idx-1}.altup_unembed_projections", + prefix=f"{prefix}.altup_unembed_projections.{idx-1}", ) for idx in range(1, self.config.altup_num_inputs) ]) @@ -774,7 +782,7 @@ class Gemma3nModel(nn.Module): **kwargs) -class Gemma3nForConditionalGeneration(nn.Module): +class Gemma3nForConditionalGeneration(nn.Module, SupportsQuant): packed_modules_mapping = { "qkv_proj": [ "q_proj", From 61dcc280faf305778c0c44597e823f40063aaed6 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 1 Aug 2025 14:10:56 +0800 Subject: [PATCH 09/54] [Doc] Add Voxtral to Supported Models page (#22059) Signed-off-by: DarkLight1337 --- docs/models/supported_models.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index f5d9e3b22f2a6..56c77a1e5f118 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -713,6 +713,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition. | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `WhisperForConditionalGeneration` | Whisper | `openai/whisper-small`, `openai/whisper-large-v3-turbo`, etc. | | | | +| `VoxtralForConditionalGeneration` | Voxtral (Mistral format) | `mistralai/Voxtral-Mini-3B-2507`, `mistralai/Voxtral-Small-24B-2507`, etc. 
| | ✅︎ | ✅︎ | ### Pooling Models From 53d7c39271aeb0568afcae337396a972e1848586 Mon Sep 17 00:00:00 2001 From: Aviad Rossmann Date: Fri, 1 Aug 2025 09:23:18 +0300 Subject: [PATCH 10/54] Update sampling_metadata.py (#21937) Signed-off-by: Aviad Rossmann --- vllm/model_executor/sampling_metadata.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 56f0f0984bfa0..66bcf1c4bfe50 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -539,37 +539,37 @@ class SamplingTensors: temperatures_t = torch.tensor( temperatures, device="cpu", - dtype=dtype, + dtype=torch.float32, pin_memory=pin_memory, ) top_ps_t = torch.tensor( top_ps, device="cpu", - dtype=dtype, + dtype=torch.float32, pin_memory=pin_memory, ) min_ps_t = torch.tensor( min_ps, device="cpu", - dtype=dtype, + dtype=torch.float32, pin_memory=pin_memory, ) presence_penalties_t = torch.tensor( presence_penalties, device="cpu", - dtype=dtype, + dtype=torch.float32, pin_memory=pin_memory, ) frequency_penalties_t = torch.tensor( frequency_penalties, device="cpu", - dtype=dtype, + dtype=torch.float32, pin_memory=pin_memory, ) repetition_penalties_t = torch.tensor( repetition_penalties, device="cpu", - dtype=dtype, + dtype=torch.float32, pin_memory=pin_memory, ) top_ks_t = torch.tensor( From 79731a79f09dc7bbe34dc8afbe8ef2242fb94a05 Mon Sep 17 00:00:00 2001 From: Hongsheng Liu Date: Fri, 1 Aug 2025 15:01:22 +0800 Subject: [PATCH 11/54] [Doc] Fix a syntax error of example code in structured_outputs.md (#22045) Signed-off-by: wangzi <3220100013@zju.edu.cn> Co-authored-by: wangzi <3220100013@zju.edu.cn> --- docs/features/structured_outputs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 4f737afa80f55..8a934d406f382 100644 --- a/docs/features/structured_outputs.md 
+++ b/docs/features/structured_outputs.md @@ -103,7 +103,7 @@ The next example shows how to use the `guided_json` parameter with a Pydantic mo "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's", } ], - "response_format": { + response_format={ "type": "json_schema", "json_schema": { "name": "car-description", From b4e081cb150797b12039cc1232205dbb25ca0206 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 1 Aug 2025 15:03:56 +0800 Subject: [PATCH 12/54] [Bugfix] Disable multi-modal preprocessor cache for DP (#21896) Signed-off-by: DarkLight1337 --- vllm/config.py | 6 ++++++ vllm/engine/arg_utils.py | 12 ++++++++++++ vllm/entrypoints/cli/serve.py | 5 +++-- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 9d5739ca11efd..93daab7d6ae97 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -871,6 +871,12 @@ class ModelConfig: return None + def set_disable_mm_preprocessor_cache(self, value: bool) -> None: + mm_config = self.get_multimodal_config() + + self.disable_mm_preprocessor_cache = value + mm_config.disable_mm_preprocessor_cache = value + def _get_encoder_config(self): return get_sentence_transformer_tokenizer_config( self.model, self.revision) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c36c79c69317e..78272d983eaf5 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1197,6 +1197,18 @@ class EngineArgs: enable_multimodal_encoder_data_parallel, ) + supports_mm_preprocessor_cache = (self.data_parallel_size == 1 + or data_parallel_external_lb) + if (not supports_mm_preprocessor_cache + and model_config.is_multimodal_model + and not model_config.disable_mm_preprocessor_cache): + logger.warning( + "Multi-modal preprocessor cache is not compatible " + "with data parallelism when there does not exist a " + "one-to-one correspondance between API process and " + "EngineCore process, so the cache will be disabled.") + 
model_config.set_disable_mm_preprocessor_cache(True) + speculative_config = self.create_speculative_config( target_model_config=model_config, target_parallel_config=parallel_config, diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 7dcba2cccdb52..bdbe71b832f4f 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -167,8 +167,9 @@ def run_multi_api_server(args: argparse.Namespace): if model_config.is_multimodal_model and not ( orig_disable_mm_preprocessor_cache): - logger.warning("Multi-model preprocessor cache will be disabled " - "for api_server_count > 1") + logger.warning( + "Multi-modal preprocessor cache is not compatible " + "with api_server_count > 1, so the cache will be disabled.") executor_class = Executor.get_class(vllm_config) log_stats = not engine_args.disable_log_stats From e0f63e4a3509a9323339eee67c96ac3c93d15923 Mon Sep 17 00:00:00 2001 From: Zebing Lin Date: Fri, 1 Aug 2025 03:23:29 -0400 Subject: [PATCH 13/54] [Core] Avoid repeated len(block_token_ids) check in hash_request_tokens (#21781) Signed-off-by: linzebing --- vllm/v1/core/kv_cache_utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 25520eb655111..eab1560b1a18c 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -567,12 +567,10 @@ def hash_request_tokens(hash_function: Any, block_size: int, ret = [] parent_block_hash_value = None - for start in range(0, len(token_ids), block_size): + # Only full blocks will be hashed + for start in range(0, len(token_ids) - block_size + 1, block_size): end = start + block_size block_token_ids = token_ids[start:end] - # Do not hash the block if it is not full. - if len(block_token_ids) < block_size: - break if req_need_extra_keys: # MM and LoRA requests need extra keys for block-hash computation. 
From 98df153abfcc443218aacfe61b3fd5abe2b88142 Mon Sep 17 00:00:00 2001 From: Sungyoon Jeong <157349761+n0gu-furiosa@users.noreply.github.com> Date: Fri, 1 Aug 2025 16:54:17 +0900 Subject: [PATCH 14/54] [Frontend] Align tool_choice="required" behavior with OpenAI when tools is empty (#21052) Signed-off-by: Sungyoon Jeong --- vllm/entrypoints/openai/protocol.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index b6b3bf3f530e3..d77aee345843c 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -859,6 +859,15 @@ class ChatCompletionRequest(OpenAIBaseModel): 'are supported.' ) + # if tool_choice is "required" but the "tools" list is empty, + # override the data to behave like "none" to align with + # OpenAI’s behavior. + if data["tool_choice"] == "required" and isinstance( + data["tools"], list) and len(data["tools"]) == 0: + data["tool_choice"] = "none" + del data["tools"] + return data + # ensure that if "tool_choice" is specified as an object, # it matches a valid tool correct_usage_message = 'Correct usage: `{"type": "function",' \ From da31f6ad3dacea8579adfb36d64d28759dc5c095 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Fri, 1 Aug 2025 01:26:24 -0700 Subject: [PATCH 15/54] Revert precompile wheel changes (#22055) --- docker/Dockerfile | 27 +++---- requirements/test.txt | 24 ++---- setup.py | 182 ++++++++++++++++++++---------------------- vllm/envs.py | 11 +-- 4 files changed, 107 insertions(+), 137 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 413151b3edb00..0d6afca74e867 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -206,7 +206,16 @@ ARG SCCACHE_REGION_NAME=us-west-2 ARG SCCACHE_S3_NO_CREDENTIALS=0 # Flag to control whether to use pre-built vLLM wheels -ARG VLLM_USE_PRECOMPILED="" +ARG VLLM_USE_PRECOMPILED +# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", 
this should be fixed +ENV VLLM_USE_PRECOMPILED="" +RUN if [ "${VLLM_USE_PRECOMPILED}" = "1" ]; then \ + export VLLM_USE_PRECOMPILED=1 && \ + echo "Using precompiled wheels"; \ + else \ + unset VLLM_USE_PRECOMPILED && \ + echo "Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"; \ + fi # if USE_SCCACHE is set, use sccache to speed up compilation RUN --mount=type=cache,target=/root/.cache/uv \ @@ -223,8 +232,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \ && export SCCACHE_IDLE_TIMEOUT=0 \ && export CMAKE_BUILD_TYPE=Release \ - && export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \ - && export VLLM_DOCKER_BUILD_CONTEXT=1 \ && sccache --show-stats \ && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \ && sccache --show-stats; \ @@ -238,22 +245,9 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ # Clean any existing CMake artifacts rm -rf .deps && \ mkdir -p .deps && \ - export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \ - export VLLM_DOCKER_BUILD_CONTEXT=1 && \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ fi -# When using precompiled wheels, keep only the newest manylinux1 wheel and delete others -RUN if [ "$VLLM_USE_PRECOMPILED" = "1" ]; then \ - echo "Cleaning up extra wheels in dist/..." && \ - # Identify the most recent manylinux1_x86_64 wheel - KEEP_WHEEL=$(ls -t dist/*manylinux1_x86_64.whl 2>/dev/null | head -n1) && \ - if [ -n "$KEEP_WHEEL" ]; then \ - echo "Keeping wheel: $KEEP_WHEEL"; \ - find dist/ -type f -name "*.whl" ! -path "${KEEP_WHEEL}" -delete; \ - fi; \ - fi - # Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py # sync the default value with .buildkite/check-wheel-size.py @@ -369,7 +363,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ fi # Install vllm wheel first, so that torch etc will be installed. 
-# !bang RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/uv \ uv pip install --system dist/*.whl --verbose \ diff --git a/requirements/test.txt b/requirements/test.txt index 4aaca2afea266..d45048aae5809 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -22,7 +22,9 @@ aiohttp==3.10.11 aiohttp-cors==0.8.1 # via ray aiosignal==1.3.1 - # via aiohttp + # via + # aiohttp + # ray albucore==0.0.16 # via terratorch albumentations==1.4.6 @@ -137,7 +139,7 @@ contourpy==1.3.0 # via matplotlib cramjam==2.9.0 # via fastparquet -cupy-cuda12x==13.5.1 +cupy-cuda12x==13.3.0 # via ray cycler==0.12.1 # via matplotlib @@ -224,6 +226,7 @@ frozenlist==1.5.0 # via # aiohttp # aiosignal + # ray fsspec==2024.9.0 # via # datasets @@ -600,18 +603,10 @@ opencv-python-headless==4.11.0.86 opentelemetry-api==1.35.0 # via # mlflow-skinny - # opentelemetry-exporter-prometheus # opentelemetry-sdk # opentelemetry-semantic-conventions -opentelemetry-exporter-prometheus==0.56b0 - # via ray -opentelemetry-proto==1.36.0 - # via ray opentelemetry-sdk==1.35.0 - # via - # mlflow-skinny - # opentelemetry-exporter-prometheus - # ray + # via mlflow-skinny opentelemetry-semantic-conventions==0.56b0 # via opentelemetry-sdk packaging==24.2 @@ -702,9 +697,7 @@ pqdm==0.2.0 pretrainedmodels==0.7.4 # via segmentation-models-pytorch prometheus-client==0.22.0 - # via - # opentelemetry-exporter-prometheus - # ray + # via ray propcache==0.2.0 # via yarl proto-plus==1.26.1 @@ -714,7 +707,6 @@ protobuf==5.28.3 # google-api-core # googleapis-common-protos # mlflow-skinny - # opentelemetry-proto # proto-plus # ray # tensorboardx @@ -862,7 +854,7 @@ rasterio==1.4.3 # rioxarray # terratorch # torchgeo -ray==2.48.0 +ray==2.43.0 # via -r requirements/test.in redis==5.2.0 # via tensorizer diff --git a/setup.py b/setup.py index bfa195d4395f0..64cfbb8db962b 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,6 @@ import json import logging import 
os import re -import shutil import subprocess import sys from pathlib import Path @@ -282,69 +281,10 @@ class cmake_build_ext(build_ext): self.copy_file(file, dst_file) -class precompiled_wheel_utils: +class repackage_wheel(build_ext): """Extracts libraries and other files from an existing wheel.""" - @staticmethod - def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict: - import tempfile - import zipfile - - temp_dir = None - try: - if not os.path.isfile(wheel_url_or_path): - wheel_filename = wheel_url_or_path.split("/")[-1] - temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") - wheel_path = os.path.join(temp_dir, wheel_filename) - print(f"Downloading wheel from {wheel_url_or_path} " - f"to {wheel_path}") - from urllib.request import urlretrieve - urlretrieve(wheel_url_or_path, filename=wheel_path) - else: - wheel_path = wheel_url_or_path - print(f"Using existing wheel at {wheel_path}") - - package_data_patch = {} - - with zipfile.ZipFile(wheel_path) as wheel: - files_to_copy = [ - "vllm/_C.abi3.so", - "vllm/_moe_C.abi3.so", - "vllm/_flashmla_C.abi3.so", - "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", - "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", - "vllm/cumem_allocator.abi3.so", - ] - - compiled_regex = re.compile( - r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") - file_members = list( - filter(lambda x: x.filename in files_to_copy, - wheel.filelist)) - file_members += list( - filter(lambda x: compiled_regex.match(x.filename), - wheel.filelist)) - - for file in file_members: - print(f"[extract] {file.filename}") - target_path = os.path.join(".", file.filename) - os.makedirs(os.path.dirname(target_path), exist_ok=True) - with wheel.open(file.filename) as src, open( - target_path, "wb") as dst: - shutil.copyfileobj(src, dst) - - pkg = os.path.dirname(file.filename).replace("/", ".") - package_data_patch.setdefault(pkg, []).append( - os.path.basename(file.filename)) - - return package_data_patch - finally: - if temp_dir is not None: - 
print(f"Removing temporary directory {temp_dir}") - shutil.rmtree(temp_dir) - - @staticmethod - def get_base_commit_in_main_branch() -> str: + def get_base_commit_in_main_branch(self) -> str: # Force to use the nightly wheel. This is mainly used for CI testing. if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: return "nightly" @@ -357,10 +297,6 @@ class precompiled_wheel_utils: ]).decode("utf-8") upstream_main_commit = json.loads(resp_json)["sha"] - # In Docker build context, .git may be immutable or missing. - if envs.VLLM_DOCKER_BUILD_CONTEXT: - return upstream_main_commit - # Check if the upstream_main_commit exists in the local repo try: subprocess.check_output( @@ -393,15 +329,92 @@ class precompiled_wheel_utils: "wheel may not be compatible with your dev branch: %s", err) return "nightly" + def run(self) -> None: + assert _is_cuda( + ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" + + wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) + if wheel_location is None: + base_commit = self.get_base_commit_in_main_branch() + wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + # Fallback to nightly wheel if latest commit wheel is unavailable, + # in this rare case, the nightly release CI hasn't finished on main. 
+ if not is_url_available(wheel_location): + wheel_location = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + + import zipfile + + if os.path.isfile(wheel_location): + wheel_path = wheel_location + print(f"Using existing wheel={wheel_path}") + else: + # Download the wheel from a given URL, assume + # the filename is the last part of the URL + wheel_filename = wheel_location.split("/")[-1] + + import tempfile + + # create a temporary directory to store the wheel + temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") + wheel_path = os.path.join(temp_dir, wheel_filename) + + print(f"Downloading wheel from {wheel_location} to {wheel_path}") + + from urllib.request import urlretrieve + + try: + urlretrieve(wheel_location, filename=wheel_path) + except Exception as e: + from setuptools.errors import SetupError + + raise SetupError( + f"Failed to get vLLM wheel from {wheel_location}") from e + + with zipfile.ZipFile(wheel_path) as wheel: + files_to_copy = [ + "vllm/_C.abi3.so", + "vllm/_moe_C.abi3.so", + "vllm/_flashmla_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", + "vllm/cumem_allocator.abi3.so", + # "vllm/_version.py", # not available in nightly wheels yet + ] + + file_members = list( + filter(lambda x: x.filename in files_to_copy, wheel.filelist)) + + # vllm_flash_attn python code: + # Regex from + # `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)` + compiled_regex = re.compile( + r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py") + file_members += list( + filter(lambda x: compiled_regex.match(x.filename), + wheel.filelist)) + + for file in file_members: + print(f"Extracting and including {file.filename} " + "from existing wheel") + package_name = os.path.dirname(file.filename).replace("/", ".") + file_name = os.path.basename(file.filename) + + if package_name not in package_data: + package_data[package_name] = [] + + wheel.extract(file) + if 
file_name.endswith(".py"): + # python files shouldn't be added to package_data + continue + + package_data[package_name].append(file_name) + def _no_device() -> bool: return VLLM_TARGET_DEVICE == "empty" def _is_cuda() -> bool: - # Allow forced CUDA in Docker/precompiled builds, even without torch.cuda - if envs.VLLM_USE_PRECOMPILED and envs.VLLM_DOCKER_BUILD_CONTEXT: - return True has_cuda = torch.version.cuda is not None return (VLLM_TARGET_DEVICE == "cuda" and has_cuda and not (_is_neuron() or _is_tpu())) @@ -626,37 +639,16 @@ package_data = { ] } -# If using precompiled, extract and patch package_data (in advance of setup) -if envs.VLLM_USE_PRECOMPILED: - assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" - wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) - if wheel_location is not None: - wheel_url = wheel_location - else: - base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch() - wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - from urllib.request import urlopen - try: - with urlopen(wheel_url) as resp: - if resp.status != 200: - wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - except Exception as e: - print(f"[warn] Falling back to nightly wheel: {e}") - wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - - patch = precompiled_wheel_utils.extract_precompiled_and_patch_package( - wheel_url) - for pkg, files in patch.items(): - package_data.setdefault(pkg, []).extend(files) - if _no_device(): ext_modules = [] -if not ext_modules or envs.VLLM_USE_PRECOMPILED: - # Disable build_ext when using precompiled wheel +if not ext_modules: cmdclass = {} else: - cmdclass = {"build_ext": cmake_build_ext} + cmdclass = { + "build_ext": + repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext + } setup( # static metadata should rather go in pyproject.toml diff --git 
a/vllm/envs.py b/vllm/envs.py index 19bc9156b2586..7553eccf16ea9 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -68,7 +68,6 @@ if TYPE_CHECKING: MAX_JOBS: Optional[str] = None NVCC_THREADS: Optional[str] = None VLLM_USE_PRECOMPILED: bool = False - VLLM_DOCKER_BUILD_CONTEXT: bool = False VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL: bool = False VLLM_NO_DEPRECATION_WARNING: bool = False VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False @@ -228,14 +227,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # If set, vllm will use precompiled binaries (*.so) "VLLM_USE_PRECOMPILED": - lambda: os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in - ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), - - # Used to mark that setup.py is running in a Docker build context, - # in order to force the use of precompiled binaries. - "VLLM_DOCKER_BUILD_CONTEXT": - lambda: os.environ.get("VLLM_DOCKER_BUILD_CONTEXT", "").strip().lower() in - ("1", "true"), + lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool( + os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), # Whether to force using nightly wheel in python build. # This is used for testing the nightly wheel in python build. 
From 27a145e8931582fc74c1f46e0e4630c610b96160 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 1 Aug 2025 01:35:49 -0700 Subject: [PATCH 16/54] [Doc] Add example for Step3-VL (#22061) Signed-off-by: Roger Wang --- examples/offline_inference/vision_language.py | 298 ++++++++++-------- .../vision_language_multi_image.py | 215 +++++++------ 2 files changed, 286 insertions(+), 227 deletions(-) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 0edcd0407747c..a75b8e2b047d8 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -423,32 +423,6 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: ) -# SmolVLM2-2.2B-Instruct -def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" - - engine_args = EngineArgs( - model=model_name, - max_model_len=8192, - max_num_seqs=2, - enforce_eager=True, - mm_processor_kwargs={ - "max_image_size": {"longest_edge": 384}, - }, - limit_mm_per_prompt={modality: 1}, - ) - prompts = [ - (f"<|im_start|>User:{question}\nAssistant:") - for question in questions - ] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - ) - - # Intern-S1 def run_interns1(questions: list[str], modality: str) -> ModelRequestData: model_name = "internlm/Intern-S1" @@ -522,44 +496,6 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData: ) -# Nemontron_VL -def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData: - model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1" - - engine_args = EngineArgs( - model=model_name, - trust_remote_code=True, - max_model_len=8192, - limit_mm_per_prompt={modality: 1}, - ) - - assert modality == "image" - placeholder = "" - - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - messages = [ - 
[{"role": "user", "content": f"{placeholder}\n{question}"}] - for question in questions - ] - prompts = tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - - # Stop tokens for InternVL - # models variants may have different stop tokens - # please refer to the model card for the correct "stop words": - # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py - stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] - stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] - stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - stop_token_ids=stop_token_ids, - ) - - # Keye-VL def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData: model_name = "Kwai-Keye/Keye-VL-8B-Preview" @@ -615,6 +551,41 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData: ) +def run_llama4(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" + + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=4, + tensor_parallel_size=8, + gpu_memory_utilization=0.4, + limit_mm_per_prompt={modality: 1}, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name) + messages = [ + [ + { + "role": "user", + "content": [{"type": "image"}, {"type": "text", "text": f"{question}"}], + } + ] + for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False + ) + stop_token_ids = None + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) + + # LLaVA-1.5 def run_llava(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -857,41 +828,6 @@ def run_mllama(questions: list[str], modality: str) -> 
ModelRequestData: ) -def run_llama4(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - - model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" - - engine_args = EngineArgs( - model=model_name, - max_model_len=8192, - max_num_seqs=4, - tensor_parallel_size=8, - gpu_memory_utilization=0.4, - limit_mm_per_prompt={modality: 1}, - ) - - tokenizer = AutoTokenizer.from_pretrained(model_name) - messages = [ - [ - { - "role": "user", - "content": [{"type": "image"}, {"type": "text", "text": f"{question}"}], - } - ] - for question in questions - ] - prompts = tokenizer.apply_chat_template( - messages, add_generation_prompt=True, tokenize=False - ) - stop_token_ids = None - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - stop_token_ids=stop_token_ids, - ) - - # Molmo def run_molmo(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -917,6 +853,44 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData: ) +# Nemontron_VL +def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData: + model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=8192, + limit_mm_per_prompt={modality: 1}, + ) + + assert modality == "image" + placeholder = "" + + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + messages = [ + [{"role": "user", "content": f"{placeholder}\n{question}"}] + for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + # Stop tokens for InternVL + # models variants may have different stop tokens + # please refer to the model card for the correct "stop words": + # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py + stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] + stop_token_ids = 
[tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) + + # NVLM-D def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1274,6 +1248,94 @@ def run_qwen2_5_omni(questions: list[str], modality: str): ) +# SkyworkR1V +def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "Skywork/Skywork-R1V-38B" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + limit_mm_per_prompt={modality: 1}, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + messages = [ + [{"role": "user", "content": f"\n{question}"}] for question in questions + ] + prompts = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + # Stop tokens for SkyworkR1V + # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py + stop_tokens = ["<|end▁of▁sentence|>", "<|endoftext|>"] + stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) + + +# SmolVLM2-2.2B-Instruct +def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" + + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + enforce_eager=True, + mm_processor_kwargs={ + "max_image_size": {"longest_edge": 384}, + }, + limit_mm_per_prompt={modality: 1}, + ) + prompts = [ + (f"<|im_start|>User:{question}\nAssistant:") + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + +# Step3 +def 
run_step3(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "stepfun-ai/step3-fp8" + + # NOTE: Below are verified configurations for step3-fp8 + # on 8xH100 GPUs. + engine_args = EngineArgs( + model=model_name, + max_num_batched_tokens=4096, + gpu_memory_utilization=0.85, + tensor_parallel_size=8, + limit_mm_per_prompt={modality: 1}, + reasoning_parser="step3", + ) + + prompts = [ + "<|begin▁of▁sentence|> You are a helpful assistant. <|BOT|>user\n " + f"{question} <|EOT|><|BOT|>assistant\n\n" + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # omni-research/Tarsier-7b def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1324,39 +1386,6 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData: ) -# SkyworkR1V -def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: - assert modality == "image" - - model_name = "Skywork/Skywork-R1V-38B" - - engine_args = EngineArgs( - model=model_name, - trust_remote_code=True, - max_model_len=4096, - limit_mm_per_prompt={modality: 1}, - ) - - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - messages = [ - [{"role": "user", "content": f"\n{question}"}] for question in questions - ] - prompts = tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - - # Stop tokens for SkyworkR1V - # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py - stop_tokens = ["<|end▁of▁sentence|>", "<|endoftext|>"] - stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] - - return ModelRequestData( - engine_args=engine_args, - prompts=prompts, - stop_token_ids=stop_token_ids, - ) - - model_example_map = { "aria": run_aria, "aya_vision": run_aya_vision, @@ -1373,9 +1402,9 @@ model_example_map = { "idefics3": run_idefics3, "interns1": 
run_interns1, "internvl_chat": run_internvl, - "nemotron_vl": run_nemotron_vl, "keye_vl": run_keye_vl, "kimi_vl": run_kimi_vl, + "llama4": run_llama4, "llava": run_llava, "llava-next": run_llava_next, "llava-next-video": run_llava_next_video, @@ -1385,8 +1414,8 @@ model_example_map = { "minicpmv": run_minicpmv, "mistral3": run_mistral3, "mllama": run_mllama, - "llama4": run_llama4, "molmo": run_molmo, + "nemotron_vl": run_nemotron_vl, "NVLM_D": run_nvlm_d, "ovis": run_ovis, "paligemma": run_paligemma, @@ -1401,6 +1430,7 @@ model_example_map = { "qwen2_5_omni": run_qwen2_5_omni, "skywork_chat": run_skyworkr1v, "smolvlm": run_smolvlm, + "step3": run_step3, "tarsier": run_tarsier, "tarsier2": run_tarsier2, } diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index dd50f3639709e..1ab405fa14f3a 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -197,6 +197,53 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_hyperclovax_seed_vision( + question: str, image_urls: list[str] +) -> ModelRequestData: + model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B" + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=16384, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + message = {"role": "user", "content": list()} + for _image_url in image_urls: + message["content"].append( + { + "type": "image", + "image": _image_url, + "ocr": "", + "lens_keywords": "", + "lens_local_keywords": "", + } + ) + message["content"].append( + { + "type": "text", + "text": question, + } + ) + + prompt = tokenizer.apply_chat_template( + [ + message, + ], + tokenize=False, + add_generation_prompt=True, + ) + + return ModelRequestData( + engine_args=engine_args, + 
prompt=prompt, + stop_token_ids=None, + image_data=[fetch_image(url) for url in image_urls], + ) + + def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "HuggingFaceM4/Idefics3-8B-Llama3" @@ -225,34 +272,6 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData: ) -def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: - model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct" - - # The configuration below has been confirmed to launch on a single L40 GPU. - engine_args = EngineArgs( - model=model_name, - max_model_len=8192, - max_num_seqs=16, - enforce_eager=True, - limit_mm_per_prompt={"image": len(image_urls)}, - mm_processor_kwargs={ - "max_image_size": {"longest_edge": 384}, - }, - ) - - placeholders = "\n".join( - f"Image-{i}: \n" for i, _ in enumerate(image_urls, start=1) - ) - prompt = ( - f"<|im_start|>User:{placeholders}\n{question}\nAssistant:" # noqa: E501 - ) - return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], - ) - - def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "internlm/Intern-S1" @@ -316,49 +335,36 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: ) -def load_hyperclovax_seed_vision( - question: str, image_urls: list[str] -) -> ModelRequestData: - model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B" - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) +def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" engine_args = EngineArgs( model=model_name, - trust_remote_code=True, - max_model_len=16384, + max_model_len=131072, + tensor_parallel_size=8, limit_mm_per_prompt={"image": len(image_urls)}, ) - message = {"role": "user", "content": list()} - for _image_url in image_urls: - message["content"].append( - { - 
"type": "image", - "image": _image_url, - "ocr": "", - "lens_keywords": "", - "lens_local_keywords": "", - } - ) - message["content"].append( + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [ { - "type": "text", - "text": question, + "role": "user", + "content": [ + *placeholders, + {"type": "text", "text": question}, + ], } - ) + ] - prompt = tokenizer.apply_chat_template( - [ - message, - ], - tokenize=False, - add_generation_prompt=True, + processor = AutoProcessor.from_pretrained(model_name) + + prompt = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True ) return ModelRequestData( engine_args=engine_args, prompt=prompt, - stop_token_ids=None, image_data=[fetch_image(url) for url in image_urls], ) @@ -463,40 +469,6 @@ def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestDa ) -def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData: - model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" - - engine_args = EngineArgs( - model=model_name, - max_model_len=131072, - tensor_parallel_size=8, - limit_mm_per_prompt={"image": len(image_urls)}, - ) - - placeholders = [{"type": "image", "image": url} for url in image_urls] - messages = [ - { - "role": "user", - "content": [ - *placeholders, - {"type": "text", "text": question}, - ], - } - ] - - processor = AutoProcessor.from_pretrained(model_name) - - prompt = processor.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - - return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image_data=[fetch_image(url) for url in image_urls], - ) - - def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "Kwai-Keye/Keye-VL-8B-Preview" @@ -954,6 +926,62 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = 
"HuggingFaceTB/SmolVLM2-2.2B-Instruct" + + # The configuration below has been confirmed to launch on a single L40 GPU. + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=16, + enforce_eager=True, + limit_mm_per_prompt={"image": len(image_urls)}, + mm_processor_kwargs={ + "max_image_size": {"longest_edge": 384}, + }, + ) + + placeholders = "\n".join( + f"Image-{i}: \n" for i, _ in enumerate(image_urls, start=1) + ) + prompt = ( + f"<|im_start|>User:{placeholders}\n{question}\nAssistant:" # noqa: E501 + ) + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=[fetch_image(url) for url in image_urls], + ) + + +def load_step3(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "stepfun-ai/step3-fp8" + + # NOTE: Below are verified configurations for step3-fp8 + # on 8xH100 GPUs. + engine_args = EngineArgs( + model=model_name, + max_num_batched_tokens=4096, + gpu_memory_utilization=0.85, + tensor_parallel_size=8, + limit_mm_per_prompt={"image": len(image_urls)}, + reasoning_parser="step3", + ) + + prompt = ( + "<|begin▁of▁sentence|> You are a helpful assistant. 
<|BOT|>user\n " + f"{'' * len(image_urls)}{question} <|EOT|><|BOT|" + ">assistant\n\n" + ) + image_data = [fetch_image(url) for url in image_urls] + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image_data=image_data, + ) + + def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData: model_name = "omni-research/Tarsier-7b" @@ -1006,16 +1034,16 @@ model_example_map = { "deepseek_vl_v2": load_deepseek_vl2, "gemma3": load_gemma3, "h2ovl_chat": load_h2ovl, + "hyperclovax_seed_vision": load_hyperclovax_seed_vision, "idefics3": load_idefics3, "interns1": load_interns1, "internvl_chat": load_internvl, - "hyperclovax_seed_vision": load_hyperclovax_seed_vision, "keye_vl": load_keye_vl, "kimi_vl": load_kimi_vl, + "llama4": load_llama4, "llava": load_llava, "llava-next": load_llava_next, "llava-onevision": load_llava_onevision, - "llama4": load_llama4, "mistral3": load_mistral3, "mllama": load_mllama, "NVLM_D": load_nvlm_d, @@ -1028,6 +1056,7 @@ model_example_map = { "qwen2_vl": load_qwen2_vl, "qwen2_5_vl": load_qwen2_5_vl, "smolvlm": load_smolvlm, + "step3": load_step3, "tarsier": load_tarsier, "tarsier2": load_tarsier2, } From e6680f9e25a433bcd754181705e72034ce6c470c Mon Sep 17 00:00:00 2001 From: wuhang Date: Fri, 1 Aug 2025 17:04:16 +0800 Subject: [PATCH 17/54] [Bugfix] Add log prefix in non-dp mode engine core (#21889) Signed-off-by: wuhang --- vllm/entrypoints/cli/serve.py | 11 +---- vllm/entrypoints/openai/api_server.py | 12 ++---- vllm/executor/multiproc_worker_utils.py | 42 ++----------------- vllm/utils/__init__.py | 55 ++++++++++++++++++++++++- vllm/v1/engine/core.py | 22 ++-------- vllm/v1/executor/multiproc_executor.py | 14 +++---- 6 files changed, 75 insertions(+), 81 deletions(-) diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index bdbe71b832f4f..0305354a66e85 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -2,9 +2,7 @@ # SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project import argparse -import os import signal -import sys from typing import Optional import uvloop @@ -18,10 +16,9 @@ from vllm.entrypoints.openai.cli_args import (make_arg_parser, validate_parsed_serve_args) from vllm.entrypoints.utils import (VLLM_SUBCMD_PARSER_EPILOG, show_filtered_argument_or_group_from_help) -from vllm.executor.multiproc_worker_utils import _add_prefix from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, get_tcp_uri +from vllm.utils import FlexibleArgumentParser, decorate_logs, get_tcp_uri from vllm.v1.engine.core import EngineCoreProc from vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines from vllm.v1.executor.abstract import Executor @@ -229,11 +226,7 @@ def run_api_server_worker_proc(listen_address, """Entrypoint for individual API server worker processes.""" # Add process-specific prefix to stdout and stderr. - from multiprocessing import current_process - process_name = current_process().name - pid = os.getpid() - _add_prefix(sys.stdout, process_name, pid) - _add_prefix(sys.stderr, process_name, pid) + decorate_logs() uvloop.run( run_server_worker(listen_address, sock, args, client_config, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 05d9a69a65f83..26db1357da4d0 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -11,7 +11,6 @@ import multiprocessing import os import signal import socket -import sys import tempfile import uuid from argparse import Namespace @@ -95,15 +94,15 @@ from vllm.entrypoints.openai.serving_transcription import ( from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.entrypoints.utils import (cli_env_setup, load_aware_call, log_non_default_args, with_cancellation) -from vllm.executor.multiproc_worker_utils import _add_prefix from vllm.logger import init_logger from 
vllm.reasoning import ReasoningParserManager from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.usage.usage_lib import UsageContext -from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path, - is_valid_ipv6_address, set_process_title, set_ulimit) +from vllm.utils import (Device, FlexibleArgumentParser, decorate_logs, + get_open_zmq_ipc_path, is_valid_ipv6_address, + set_process_title, set_ulimit) from vllm.v1.metrics.prometheus import get_prometheus_registry from vllm.version import __version__ as VLLM_VERSION @@ -1808,10 +1807,7 @@ async def run_server(args, **uvicorn_kwargs) -> None: """Run a single-worker API server.""" # Add process-specific prefix to stdout and stderr. - process_name = "APIServer" - pid = os.getpid() - _add_prefix(sys.stdout, process_name, pid) - _add_prefix(sys.stderr, process_name, pid) + decorate_logs("APIServer") listen_address, sock = setup_server(args) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py index a6c172beff7bb..48b3479ed7997 100644 --- a/vllm/executor/multiproc_worker_utils.py +++ b/vllm/executor/multiproc_worker_utils.py @@ -3,21 +3,20 @@ import asyncio import os -import sys import threading import uuid from dataclasses import dataclass from multiprocessing import Queue from multiprocessing.connection import wait from multiprocessing.process import BaseProcess -from typing import (Any, Callable, Dict, Generic, List, Optional, TextIO, - TypeVar, Union) +from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Union import torch from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import _maybe_force_spawn, get_mp_context, run_method +from vllm.utils import (_maybe_force_spawn, decorate_logs, get_mp_context, + run_method) logger = 
init_logger(__name__) @@ -25,10 +24,6 @@ T = TypeVar('T') _TERMINATE = "TERMINATE" # sentinel -# ANSI color codes -CYAN = '\033[1;36m' -RESET = '\033[0;0m' - JOIN_TIMEOUT_S = 2 @@ -213,9 +208,7 @@ def _run_worker_process( # Add process-specific prefix to stdout and stderr process_name = get_mp_context().current_process().name - pid = os.getpid() - _add_prefix(sys.stdout, process_name, pid) - _add_prefix(sys.stderr, process_name, pid) + decorate_logs(process_name) # Initialize worker worker = worker_factory(vllm_config, rank) @@ -260,33 +253,6 @@ def _run_worker_process( logger.info("Worker exiting") -def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None: - """Prepend each output line with process-specific prefix""" - - prefix = f"{CYAN}({worker_name} pid={pid}){RESET} " - file_write = file.write - - def write_with_prefix(s: str): - if not s: - return - if file.start_new_line: # type: ignore[attr-defined] - file_write(prefix) - idx = 0 - while (next_idx := s.find('\n', idx)) != -1: - next_idx += 1 - file_write(s[idx:next_idx]) - if next_idx == len(s): - file.start_new_line = True # type: ignore[attr-defined] - return - file_write(prefix) - idx = next_idx - file_write(s[idx:]) - file.start_new_line = False # type: ignore[attr-defined] - - file.start_new_line = True # type: ignore[attr-defined] - file.write = write_with_prefix # type: ignore[method-assign] - - def set_multiprocessing_worker_envs(parallel_config): """ Set up environment variables that should be used when there are workers in a multiprocessing environment. 
This should be called by the parent diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index a7f579b0c9c2d..d5d8d9dad73a8 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -47,7 +47,7 @@ from dataclasses import dataclass, field from functools import cache, lru_cache, partial, wraps from types import MappingProxyType from typing import (TYPE_CHECKING, Any, Callable, Generic, Literal, NamedTuple, - Optional, Tuple, TypeVar, Union, cast, overload) + Optional, TextIO, Tuple, TypeVar, Union, cast, overload) from urllib.parse import urlparse from uuid import uuid4 @@ -167,6 +167,10 @@ GB_bytes = 1_000_000_000 GiB_bytes = 1 << 30 """The number of bytes in one gibibyte (GiB).""" +# ANSI color codes +CYAN = '\033[1;36m' +RESET = '\033[0;0m' + STR_DTYPE_TO_TORCH_DTYPE = { "half": torch.half, "bfloat16": torch.bfloat16, @@ -3258,3 +3262,52 @@ def set_process_title(name: str, else: name = f"{envs.VLLM_PROCESS_NAME_PREFIX}::{name}" setproctitle.setproctitle(name) + + +def _add_prefix(file: TextIO, worker_name: str, pid: int) -> None: + """Prepend each output line with process-specific prefix""" + + prefix = f"{CYAN}({worker_name} pid={pid}){RESET} " + file_write = file.write + + def write_with_prefix(s: str): + if not s: + return + if file.start_new_line: # type: ignore[attr-defined] + file_write(prefix) + idx = 0 + while (next_idx := s.find('\n', idx)) != -1: + next_idx += 1 + file_write(s[idx:next_idx]) + if next_idx == len(s): + file.start_new_line = True # type: ignore[attr-defined] + return + file_write(prefix) + idx = next_idx + file_write(s[idx:]) + file.start_new_line = False # type: ignore[attr-defined] + + file.start_new_line = True # type: ignore[attr-defined] + file.write = write_with_prefix # type: ignore[method-assign] + + +def decorate_logs(process_name: Optional[str] = None) -> None: + """ + Adds a process-specific prefix to each line of output written to stdout and + stderr. 
+ + This function is intended to be called before initializing the api_server, + engine_core, or worker classes, so that all subsequent output from the + process is prefixed with the process name and PID. This helps distinguish + log output from different processes in multi-process environments. + + Args: + process_name: Optional; the name of the process to use in the prefix. + If not provided, the current process name from the multiprocessing + context is used. + """ + if process_name is None: + process_name = get_mp_context().current_process().name + pid = os.getpid() + _add_prefix(sys.stdout, process_name, pid) + _add_prefix(sys.stderr, process_name, pid) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f9a6315df8af8..6ae5736df98b8 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -3,7 +3,6 @@ import os import queue import signal -import sys import threading import time from collections import deque @@ -19,15 +18,14 @@ import zmq from vllm.config import ParallelConfig, VllmConfig from vllm.distributed import stateless_destroy_torch_distributed_process_group -from vllm.executor.multiproc_worker_utils import _add_prefix from vllm.logger import init_logger from vllm.logging_utils.dump_input import dump_engine_exception from vllm.lora.request import LoRARequest from vllm.tasks import POOLING_TASKS, SupportedTask from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) -from vllm.utils import (make_zmq_socket, resolve_obj_by_qualname, - set_process_title) +from vllm.utils import (decorate_logs, make_zmq_socket, + resolve_obj_by_qualname, set_process_title) from vllm.v1.core.kv_cache_utils import (get_kv_cache_config, unify_kv_cache_configs) from vllm.v1.core.sched.interface import SchedulerInterface @@ -649,12 +647,14 @@ class EngineCoreProc(EngineCore): "vllm_config"].parallel_config if parallel_config.data_parallel_size > 1 or dp_rank > 0: set_process_title("DPEngineCore", str(dp_rank)) + 
decorate_logs() # Set data parallel rank for this engine process. parallel_config.data_parallel_rank = dp_rank parallel_config.data_parallel_rank_local = local_dp_rank engine_core = DPEngineCoreProc(*args, **kwargs) else: set_process_title("EngineCore") + decorate_logs() engine_core = EngineCoreProc(*args, **kwargs) engine_core.run_busy_loop() @@ -905,8 +905,6 @@ class DPEngineCoreProc(EngineCoreProc): log_stats: bool, client_handshake_address: Optional[str] = None, ): - self._decorate_logs() - # Counts forward-passes of the model so that we can synchronize # finished with DP peers every N steps. self.counter = 0 @@ -919,15 +917,6 @@ class DPEngineCoreProc(EngineCoreProc): executor_class, log_stats, client_handshake_address, dp_rank) - def _decorate_logs(self): - # Add process-specific prefix to stdout and stderr before - # we initialize the engine. - from multiprocessing import current_process - process_name = current_process().name - pid = os.getpid() - _add_prefix(sys.stdout, process_name, pid) - _add_prefix(sys.stderr, process_name, pid) - def _init_data_parallel(self, vllm_config: VllmConfig): # Configure GPUs and stateless process group for data parallel. 
@@ -1149,9 +1138,6 @@ class DPEngineCoreActor(DPEngineCoreProc): f"{(local_dp_rank + 1) * world_size}) " f"base value: \"{os.getenv(device_control_env_var)}\"") from e - def _decorate_logs(self): - pass - @contextmanager def _perform_handshakes(self, handshake_address: str, identity: bytes, local_client: bool, vllm_config: VllmConfig, diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 8270385053852..d90051c3224fd 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -4,7 +4,6 @@ import multiprocessing import os import pickle import signal -import sys import threading import time import traceback @@ -28,10 +27,11 @@ from vllm.distributed.device_communicators.shm_broadcast import (Handle, MessageQueue) from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator from vllm.executor.multiproc_worker_utils import ( - _add_prefix, set_multiprocessing_worker_envs) + set_multiprocessing_worker_envs) from vllm.logger import init_logger -from vllm.utils import (get_distributed_init_method, get_loopback_ip, - get_mp_context, get_open_port, set_process_title) +from vllm.utils import (decorate_logs, get_distributed_init_method, + get_loopback_ip, get_mp_context, get_open_port, + set_process_title) from vllm.v1.executor.abstract import Executor, FailureCallback from vllm.v1.outputs import ModelRunnerOutput from vllm.worker.worker_base import WorkerWrapperBase @@ -382,11 +382,11 @@ class WorkerProc: pp_str = f"PP{rank // tp_size}" if pp_size > 1 else "" tp_str = f"TP{rank % tp_size}" if tp_size > 1 else "" suffix = f"{pp_str}{'_' if pp_str and tp_str else ''}{tp_str}" + process_name = "VllmWorker" if suffix: set_process_title(suffix, append=True) - pid = os.getpid() - _add_prefix(sys.stdout, f"VllmWorker rank={rank}", pid) - _add_prefix(sys.stderr, f"VllmWorker rank={rank}", pid) + process_name = f"{process_name} {suffix}" + decorate_logs(process_name) # Initialize 
MessageQueue for receiving SchedulerOutput self.rpc_broadcast_mq = MessageQueue.create_from_handle( From 0f81b310db013ec9fbc1deb9de97bd9b2a9af62f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 1 Aug 2025 02:11:40 -0700 Subject: [PATCH 18/54] [Misc] Remove upper bound in openai package version (#22060) Signed-off-by: Woosuk Kwon --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index d29b3e59d35b2..6b57a3d2f1d0d 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -13,7 +13,7 @@ tokenizers >= 0.21.1 # Required for fast incremental detokenization. protobuf # Required by LlamaTokenizer. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. aiohttp -openai >= 1.87.0, <= 1.90.0 # Ensure modern openai package (ensure ResponsePrompt exists in type.responses and max_completion_tokens field support) +openai >= 1.87.0 # Ensure modern openai package (ensure ResponsePrompt exists in type.responses and max_completion_tokens field support) pydantic >= 2.10 prometheus_client >= 0.18.0 pillow # Required for image processing From 49314869887e169be080201ab8bcda14e745c080 Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Fri, 1 Aug 2025 17:11:56 +0800 Subject: [PATCH 19/54] [Doc] Added warning of speculating with draft model (#22047) Signed-off-by: Dilute-l Co-authored-by: Dilute-l --- docs/features/spec_decode.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/features/spec_decode.md b/docs/features/spec_decode.md index be4b91feda7aa..89d5b489e1888 100644 --- a/docs/features/spec_decode.md +++ b/docs/features/spec_decode.md @@ -15,6 +15,10 @@ Speculative decoding is a technique which improves inter-token latency in memory The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 
tokens at a time. +!!! warning + In vllm v0.10.0, speculative decoding with a draft model is not supported. + If you use the following code, you will get a `NotImplementedError`. + ??? code ```python From 28b18cc741e596ea6f9981b8365c4819523fc24b Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 1 Aug 2025 19:09:54 +0800 Subject: [PATCH 20/54] [Quantization] Enable BNB support for InternS1 (#21953) Signed-off-by: Jee Jee Li --- .../model_loader/bitsandbytes_loader.py | 39 ++++++++++++------- vllm/model_executor/utils.py | 20 +++++++++- 2 files changed, 43 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index 68fcb785691c8..f54dfab5238e1 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -34,7 +34,8 @@ from vllm.model_executor.model_loader.weight_utils import ( filter_duplicate_safetensors_files, filter_files_not_needed_for_inference, pt_weights_iterator, safetensors_weights_iterator) from vllm.model_executor.models import is_pooling_model -from vllm.model_executor.utils import (get_packed_modules_mapping, +from vllm.model_executor.utils import (get_moe_expert_mapping, + get_packed_modules_mapping, set_weight_attrs) from vllm.platforms import current_platform @@ -43,6 +44,12 @@ from vllm.platforms import current_platform logger = init_logger(__name__) +def is_moe_model(model: torch.nn.Module) -> bool: + """Checks if the model contains FusedMoE layers.""" + return bool(any( + isinstance(module, FusedMoE) for module in model.modules())) + + class BitsAndBytesModelLoader(BaseModelLoader): """Model loader to load model weights with BitAndBytes quantization.""" @@ -61,6 +68,8 @@ class BitsAndBytesModelLoader(BaseModelLoader): # Store all module names (from transformers) that support # BNB quantization. 
self.target_modules: list[str] = [] + # Store the mapping of expert parameters for MoE models. + self.expert_params_mapping: list[tuple[str, str, int, str]] = [] # mapping weight names from transformers to vllm. self.weight_mapper: Callable = lambda name: name self.pre_quant: bool = False @@ -413,13 +422,8 @@ class BitsAndBytesModelLoader(BaseModelLoader): # in case model has a mixture of disk-merged and disk-split # weights with same last name. self.target_modules.append(name) - elif (isinstance(module, FusedMoE) - and hasattr(module.quant_method, "quant_config")): - if not hasattr(model, "get_expert_mapping"): - raise AttributeError( - f"MoE Model {type(model).__name__} does not support " - "BitsAndBytes quantization yet. Ensure this model has " - "'get_expert_mapping' method.") + elif isinstance(module, FusedMoE) and hasattr( + module.quant_method, "quant_config"): # TODO: support FusedMoE with prequant and 8bit. if self.pre_quant: raise ValueError( @@ -430,9 +434,9 @@ class BitsAndBytesModelLoader(BaseModelLoader): "BitsAndBytes 8bit quantization with FusedMoE is not " "supported yet.") # Get the corresponding weight name using module name and - # get_expert_mapping. - expert_mapping = model.get_expert_mapping() - for exp in expert_mapping: + # expert_params_mapping. 
+ + for exp in self.expert_params_mapping: weight_name = exp[1] rep_name = name.replace("experts", "") + weight_name.removesuffix(".") @@ -464,7 +468,7 @@ class BitsAndBytesModelLoader(BaseModelLoader): elif isinstance(module, (RowParallelLinear, )): self.column_sharded_weights_modules.append(name) elif isinstance(module, FusedMoE): - expert_mapping = model.get_expert_mapping() + expert_mapping = self.expert_params_mapping for exp in expert_mapping: if exp[-1] == "w2": weight_name = exp[1] @@ -516,6 +520,13 @@ class BitsAndBytesModelLoader(BaseModelLoader): self.is_pool_model = is_pooling_model(model) self.modules_mapping = ParamMapping(get_packed_modules_mapping(model)) + if is_moe_model(model): + self.expert_params_mapping = get_moe_expert_mapping(model) + if not self.expert_params_mapping: + raise AttributeError( + f"MoE Model {type(model).__name__} does not support " + "BitsAndBytes quantization yet. Ensure this model has " + "'get_expert_mapping' method.") # For some models like Molmo, we need to use hf_to_vllm_mapper # to ensure correct loading of weights. 
if hf_to_vllm_mapper := getattr(model, "hf_to_vllm_mapper", None): @@ -569,10 +580,10 @@ class BitsAndBytesModelLoader(BaseModelLoader): """ from bitsandbytes.functional import QuantState - if not hasattr(model, "get_expert_mapping"): + if not self.expert_params_mapping: return dict() - expert_mapping = model.get_expert_mapping() + expert_mapping = self.expert_params_mapping expert_qs_dict = {} for name, module in model.named_modules(): if not isinstance(module, FusedMoE): diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 2b20ca2a3ba3f..41ed0b09c5a2a 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utils for model executor.""" + import copy from typing import Any, Optional @@ -9,6 +10,7 @@ import torch def set_random_seed(seed: int) -> None: from vllm.platforms import current_platform + current_platform.seed_everything(seed) @@ -29,7 +31,7 @@ def set_weight_attrs( return for key, value in weight_attrs.items(): assert not hasattr( - weight, key), (f"Overwriting existing tensor attribute: {key}") + weight, key), f"Overwriting existing tensor attribute: {key}" # NOTE(woosuk): During weight loading, we often do something like: # narrowed_tensor = param.data.narrow(0, offset, len) @@ -41,6 +43,7 @@ def set_weight_attrs( # we sync the param tensor after its weight loader is called. # TODO(woosuk): Remove this hack once we have a better solution. 
from vllm.platforms import current_platform + if current_platform.is_tpu() and key == "weight_loader": value = _make_synced_weight_loader(value) setattr(weight, key, value) @@ -77,4 +80,17 @@ def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]: f"safely because of conflicts from {type(child).__name__}.") else: parent_map.update(child_map) - return parent_map \ No newline at end of file + return parent_map + + +def get_moe_expert_mapping( + model: torch.nn.Module, ) -> list[tuple[str, str, int, str]]: + if parent_map := getattr(model, "get_expert_mapping", None): + return parent_map() + else: + # We only check main components instead of whole model submodules + for child in model.children(): + child_map = getattr(child, "get_expert_mapping", None) + if child_map is not None: + return child_map() + return [] From 87c94bc87943818ad039d5c916df793fbd081e6a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 1 Aug 2025 13:24:46 +0100 Subject: [PATCH 21/54] Revert "Update sampling_metadata.py (#21937)" (#22088) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/sampling_metadata.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 66bcf1c4bfe50..56f0f0984bfa0 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -539,37 +539,37 @@ class SamplingTensors: temperatures_t = torch.tensor( temperatures, device="cpu", - dtype=torch.float32, + dtype=dtype, pin_memory=pin_memory, ) top_ps_t = torch.tensor( top_ps, device="cpu", - dtype=torch.float32, + dtype=dtype, pin_memory=pin_memory, ) min_ps_t = torch.tensor( min_ps, device="cpu", - dtype=torch.float32, + dtype=dtype, pin_memory=pin_memory, ) presence_penalties_t = torch.tensor( presence_penalties, device="cpu", - dtype=torch.float32, + 
dtype=dtype, pin_memory=pin_memory, ) frequency_penalties_t = torch.tensor( frequency_penalties, device="cpu", - dtype=torch.float32, + dtype=dtype, pin_memory=pin_memory, ) repetition_penalties_t = torch.tensor( repetition_penalties, device="cpu", - dtype=torch.float32, + dtype=dtype, pin_memory=pin_memory, ) top_ks_t = torch.tensor( From dfbc1f88807a1bddb75fc1dd587922567d7c133f Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 1 Aug 2025 08:25:18 -0400 Subject: [PATCH 22/54] [Speculative Decoding] Add `speculators` config support (#21345) --- .../speculators/test_eagle3.py | 16 ++++ vllm/config.py | 20 +++- vllm/engine/arg_utils.py | 22 ++++- vllm/model_executor/models/llama_eagle3.py | 26 +++++- vllm/transformers_utils/config.py | 32 ++++++- vllm/transformers_utils/configs/__init__.py | 2 + .../configs/speculators/__init__.py | 2 + .../configs/speculators/algos.py | 32 +++++++ .../configs/speculators/base.py | 91 +++++++++++++++++++ 9 files changed, 232 insertions(+), 11 deletions(-) create mode 100644 tests/speculative_decoding/speculators/test_eagle3.py create mode 100644 vllm/transformers_utils/configs/speculators/__init__.py create mode 100644 vllm/transformers_utils/configs/speculators/algos.py create mode 100644 vllm/transformers_utils/configs/speculators/base.py diff --git a/tests/speculative_decoding/speculators/test_eagle3.py b/tests/speculative_decoding/speculators/test_eagle3.py new file mode 100644 index 0000000000000..c58fc8c0dc5f4 --- /dev/null +++ b/tests/speculative_decoding/speculators/test_eagle3.py @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +import torch + + +@pytest.mark.parametrize( + "model_path", + [("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717"), + ("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized")]) +def test_llama(vllm_runner, example_prompts, model_path): + with vllm_runner(model_path, 
dtype=torch.bfloat16) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens=20) + print(vllm_outputs) + assert vllm_outputs diff --git a/vllm/config.py b/vllm/config.py index 93daab7d6ae97..2d61552c5dadc 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -39,8 +39,8 @@ from vllm.transformers_utils.config import ( ConfigFormat, get_config, get_hf_image_processor_config, get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, - try_get_generation_config, try_get_safetensors_metadata, - try_get_tokenizer_config, uses_mrope) + maybe_override_with_speculators_target_model, try_get_generation_config, + try_get_safetensors_metadata, try_get_tokenizer_config, uses_mrope) from vllm.transformers_utils.s3_utils import S3Model from vllm.transformers_utils.utils import is_s3, maybe_model_redirect # yapf conflicts with isort for this block @@ -535,6 +535,15 @@ class ModelConfig: "affect the random state of the Python process that " "launched vLLM.", self.seed) + if self.runner != "draft": + # If we're not running the draft model, check for speculators config + # If speculators config, set model / tokenizer to be target model + self.model, self.tokenizer = maybe_override_with_speculators_target_model( # noqa: E501 + model=self.model, + tokenizer=self.tokenizer, + revision=self.revision, + trust_remote_code=self.trust_remote_code) + # Keep set served_model_name before maybe_model_redirect(self.model) self.served_model_name = get_served_model_name(self.model, self.served_model_name) @@ -606,8 +615,8 @@ class ModelConfig: self.config_format, hf_overrides_kw=hf_overrides_kw, hf_overrides_fn=hf_overrides_fn) - self.hf_config = hf_config + self.hf_config = hf_config self.hf_text_config = get_hf_text_config(self.hf_config) self.attention_chunk_size = getattr(self.hf_text_config, "attention_chunk_size", None) @@ -2980,10 +2989,13 @@ class SpeculativeConfig: "Chunked prefill and EAGLE are not compatible " 
"when using V0.") + from vllm.transformers_utils.configs import ( + SpeculatorsConfig) from vllm.transformers_utils.configs.eagle import ( EAGLEConfig) + if isinstance(self.draft_model_config.hf_config, - EAGLEConfig): + (EAGLEConfig, SpeculatorsConfig)): pass else: eagle_config = EAGLEConfig( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 78272d983eaf5..efa077a88270a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -978,8 +978,28 @@ class EngineArgs: provided as a JSON string input via CLI arguments or directly as a dictionary from the engine. """ + + from vllm.transformers_utils.config import get_config + from vllm.transformers_utils.configs.speculators.base import ( + SpeculatorsConfig) + if self.speculative_config is None: - return None + hf_config = get_config(self.hf_config_path or self.model, + self.trust_remote_code, self.revision, + self.code_revision, self.config_format) + + # if loading a SpeculatorsConfig, load the specualtive_config + # details from the config directly + # no user input required / expected + if isinstance(hf_config, SpeculatorsConfig): + # We create one since we dont create one + self.speculative_config = {} + self.speculative_config[ + "num_speculative_tokens"] = hf_config.num_lookahead_tokens + self.speculative_config["model"] = self.model + self.speculative_config["method"] = hf_config.method + else: + return None # Note(Shangming): These parameters are not obtained from the cli arg # '--speculative-config' and must be passed in when creating the engine diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 71275f0d58579..572930c39a846 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -51,6 +51,25 @@ class LlamaDecoderLayer(LlamaDecoderLayer): self.hidden_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + if getattr(config, "norm_before_residual", False): + 
self._residual_norm = self._norm_before_residual + else: + self._residual_norm = self._norm_after_residual + + def _norm_before_residual( + self, + hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + hidden_states = self.hidden_norm(hidden_states) + residual = hidden_states + return hidden_states, residual + + def _norm_after_residual( + self, + hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + residual = hidden_states + hidden_states = self.hidden_norm(hidden_states) + return hidden_states, residual + def forward( self, positions: torch.Tensor, @@ -59,9 +78,10 @@ class LlamaDecoderLayer(LlamaDecoderLayer): residual: Optional[torch.Tensor], ) -> tuple[torch.Tensor, torch.Tensor]: - residual = hidden_states embeds = self.input_layernorm(embeds) - hidden_states = self.hidden_norm(hidden_states) + + hidden_states, residual = self._residual_norm( + hidden_states=hidden_states) hidden_states = torch.cat([embeds, hidden_states], dim=-1) # Self Attention @@ -102,7 +122,7 @@ class LlamaModel(nn.Module): self.layers = nn.ModuleList([ LlamaDecoderLayer( - self.config, + config=self.config, prefix=maybe_prefix(prefix, f"layers.{start_layer_id}"), ) ]) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index fcaa48c1392a3..0e633c2c0b6ae 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -35,8 +35,9 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, DeepseekVLV2Config, MllamaConfig, MLPSpeculatorConfig, Nemotron_Nano_VL_Config, NemotronConfig, NVLM_D_Config, - RWConfig, Step3TextConfig, - Step3VLConfig, UltravoxConfig) + RWConfig, SpeculatorsConfig, + Step3TextConfig, Step3VLConfig, + UltravoxConfig) # yapf: enable from vllm.transformers_utils.configs.mistral import adapt_config_dict from vllm.transformers_utils.utils import check_gguf_file @@ -81,6 +82,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = { "mlp_speculator": MLPSpeculatorConfig, "medusa": 
MedusaConfig, "eagle": EAGLEConfig, + "speculators": SpeculatorsConfig, "nemotron": NemotronConfig, "NVLM_D": NVLM_D_Config, "ultravox": UltravoxConfig, @@ -287,6 +289,27 @@ def _maybe_remap_hf_config_attrs(config: PretrainedConfig) -> PretrainedConfig: return config +def maybe_override_with_speculators_target_model( + model: str, + tokenizer: str, + trust_remote_code: bool, + revision: Optional[str] = None) -> tuple[str, str]: + """ + If running a speculators config, override running model with target model + """ + config_dict, _ = PretrainedConfig.get_config_dict( + model, + revision=revision, + trust_remote_code=trust_remote_code, + token=_get_hf_token(), + ) + spec_config = config_dict.get("speculators_config") + # Return the target model + if spec_config is not None: + model = tokenizer = spec_config["verifier"]["name_or_path"] + return model, tokenizer + + def get_config( model: Union[str, Path], trust_remote_code: bool, @@ -345,9 +368,12 @@ def get_config( token=_get_hf_token(), **kwargs, ) - # Use custom model class if it's in our registry model_type = config_dict.get("model_type") + if model_type is None: + model_type = "speculators" if config_dict.get( + "speculators_config") is not None else model_type + if model_type in _CONFIG_REGISTRY: config_class = _CONFIG_REGISTRY[model_type] config = config_class.from_pretrained( diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 96733da726181..64ace167a5a00 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -24,6 +24,7 @@ from vllm.transformers_utils.configs.nemotron import NemotronConfig from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig from vllm.transformers_utils.configs.nemotron_vl import Nemotron_Nano_VL_Config from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config +from vllm.transformers_utils.configs.speculators.base import SpeculatorsConfig from 
vllm.transformers_utils.configs.step3_vl import (Step3TextConfig, Step3VisionEncoderConfig, Step3VLConfig) @@ -44,6 +45,7 @@ __all__ = [ "NemotronHConfig", "Nemotron_Nano_VL_Config", "NVLM_D_Config", + "SpeculatorsConfig", "UltravoxConfig", "Step3VLConfig", "Step3VisionEncoderConfig", diff --git a/vllm/transformers_utils/configs/speculators/__init__.py b/vllm/transformers_utils/configs/speculators/__init__.py new file mode 100644 index 0000000000000..208f01a7cb5ee --- /dev/null +++ b/vllm/transformers_utils/configs/speculators/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/vllm/transformers_utils/configs/speculators/algos.py b/vllm/transformers_utils/configs/speculators/algos.py new file mode 100644 index 0000000000000..efc87b6bcf26f --- /dev/null +++ b/vllm/transformers_utils/configs/speculators/algos.py @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +SUPPORTED_SPECULATORS_TYPES = {} + + +def register_speculator(name): + + def decorator(fn): + SUPPORTED_SPECULATORS_TYPES[name] = fn + return fn + + return decorator + + +@register_speculator("eagle3") +def update_eagle3(config_dict: dict, vllm_config: dict) -> None: + """ + Apply Eagle-3 specific configuration transformations. 
+ + Eagle-3 specific fields: + - draft_vocab_size: Size of the draft model's vocabulary + - target_hidden_size: Hidden size of the target model + - norm_before_residual: Whether to apply norm before residual connection + """ + + vllm_config["draft_vocab_size"] = config_dict.get("draft_vocab_size") + if config_dict.get("target_hidden_size") is not None: + vllm_config["target_hidden_size"] = config_dict["target_hidden_size"] + vllm_config["norm_before_residual"] = config_dict.get( + "norm_before_residual", True) + vllm_config["architectures"] = ["Eagle3LlamaForCausalLM"] diff --git a/vllm/transformers_utils/configs/speculators/base.py b/vllm/transformers_utils/configs/speculators/base.py new file mode 100644 index 0000000000000..d7c16e180c709 --- /dev/null +++ b/vllm/transformers_utils/configs/speculators/base.py @@ -0,0 +1,91 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os +from typing import Any, Union + +from transformers import PretrainedConfig + +from vllm.transformers_utils.configs.speculators.algos import ( + SUPPORTED_SPECULATORS_TYPES) + +__all__ = ["SpeculatorsConfig"] + + +class SpeculatorsConfig(PretrainedConfig): + model_type = "speculators" + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: Union[str, os.PathLike], + **kwargs, + ) -> "SpeculatorsConfig": + """Load speculators Eagle config and convert to vLLM format.""" + config_dict, _ = cls.get_config_dict(pretrained_model_name_or_path, + **kwargs) + + speculators_model_type = config_dict.get("speculators_model_type") + if speculators_model_type not in SUPPORTED_SPECULATORS_TYPES: + raise ValueError( + f"Expected one of: {SUPPORTED_SPECULATORS_TYPES}. 
" + "Please ensure you're loading a speculators-format model.") + + # validate fields + # TODO: @dsikka - use speculators pydantic model to validate + cls.validate_speculators_config(config_dict=config_dict) + # Convert from speculators config -> format that can be ingested by vLLM + vllm_config = cls.convert_speculators_to_vllm(config_dict=config_dict) + # Apply anything specific to the supported algorithm + algo_updater = SUPPORTED_SPECULATORS_TYPES[speculators_model_type] + algo_updater(config_dict=config_dict, vllm_config=vllm_config) + return cls(**vllm_config) + + @classmethod + def validate_speculators_config(cls, config_dict: dict[str, Any]) -> None: + try: + spec_config = config_dict["speculators_config"] + methods = spec_config["proposal_methods"] + first_method = methods[0] + _ = first_method["speculative_tokens"] + _ = spec_config["verifier"]["name_or_path"] + _ = config_dict["speculators_model_type"] + except (KeyError, IndexError, TypeError) as e: + raise ValueError("Invalid speculators config structure") from e + + if "transformer_layer_config" not in config_dict: + raise ValueError("Must provide transformer_layer_config") + + if not isinstance(config_dict["transformer_layer_config"], dict): + raise TypeError( + "'transformer_layer_config' must be a dictionary if provided") + + @classmethod + def convert_speculators_to_vllm( + cls, config_dict: dict[str, Any]) -> dict[str, Any]: + """ + Convert speculators config format to vLLM format. + + This method handles the translation of field names and structure + between speculators and vLLM formats. + + Returns: + Dictionary with vLLM-compatible configuration + """ + # Currently we only support one proposal method + spec_config = config_dict["speculators_config"] + first_method = spec_config.get("proposal_methods")[0] + num_lookahead_tokens = first_method.get("speculative_tokens") + + if num_lookahead_tokens is None: + raise ValueError( + "Missing 'speculative_tokens' in proposal method. 
" + f"Got: {first_method}") + + # Build base vLLM config + vllm_config = { + "method": config_dict.get("speculators_model_type"), + "num_lookahead_tokens": num_lookahead_tokens, + "target_model": spec_config.get("verifier")["name_or_path"] + } + vllm_config.update(config_dict["transformer_layer_config"]) + return vllm_config From 26b5f7bd2a4005dccb797804c93cbce329253003 Mon Sep 17 00:00:00 2001 From: TJian Date: Fri, 1 Aug 2025 05:25:20 -0700 Subject: [PATCH 23/54] [BUG] [ROCm] Fix import bug on ROCm (#22083) Signed-off-by: tjtanaa --- vllm/compilation/pass_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 11e03daced160..54f00d5415216 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -7,7 +7,7 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform -if current_platform.is_cuda_alike(): +if current_platform.is_cuda(): from .fusion import FusionPass from .collective_fusion import AllReduceFusionPass, AsyncTPPass from .fusion_attn import AttnFusionPass From fb0e0d46fc443f08bc2a859b839f0f66c6a7f670 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 1 Aug 2025 13:26:42 +0100 Subject: [PATCH 24/54] Fix `get_kwargs` for case where type hint is `list[Union[str, type]]` (#22016) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/engine/test_arg_utils.py | 7 ++++++- vllm/engine/arg_utils.py | 10 ++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 1d1926068d28c..c282bf002304a 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -5,7 +5,7 @@ import json from argparse import ArgumentError from contextlib import nullcontext from dataclasses import dataclass, field -from typing import 
Annotated, Literal, Optional +from typing import Annotated, Literal, Optional, Union import pytest @@ -136,6 +136,8 @@ class DummyConfig: """List with variable length""" list_literal: list[Literal[1, 2]] = field(default_factory=list) """List with literal choices""" + list_union: list[Union[str, type[object]]] = field(default_factory=list) + """List with union type""" literal_literal: Literal[Literal[1], Literal[2]] = 1 """Literal of literals with default 1""" json_tip: dict = field(default_factory=dict) @@ -187,6 +189,9 @@ def test_get_kwargs(): assert kwargs["list_literal"]["type"] is int assert kwargs["list_literal"]["nargs"] == "+" assert kwargs["list_literal"]["choices"] == [1, 2] + # lists with unions should become str type. + # If not, we cannot know which type to use for parsing + assert kwargs["list_union"]["type"] is str # literals of literals should have merged choices assert kwargs["literal_literal"]["choices"] == [1, 2] # dict should have json tip in help diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index efa077a88270a..f938f19b90469 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -217,10 +217,12 @@ Additionally, list elements can be passed individually using `+`: elif contains_type(type_hints, list): type_hint = get_type(type_hints, list) types = get_args(type_hint) - assert len(types) == 1, ( - "List type must have exactly one type. Got " - f"{type_hint} with types {types}") - kwargs[name]["type"] = types[0] + list_type = types[0] + if get_origin(list_type) is Union: + msg = "List type must contain str if it is a Union." 
+ assert str in get_args(list_type), msg + list_type = str + kwargs[name]["type"] = list_type kwargs[name]["nargs"] = "+" elif contains_type(type_hints, int): kwargs[name]["type"] = int From f81c1bb05504672ddd66905161c6ada549fd4b85 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 1 Aug 2025 08:28:45 -0400 Subject: [PATCH 25/54] [Bugfix] Check NVIDIA artifactory is accessible before using flashinfer cubin kernels (#21893) --- vllm/attention/backends/flashinfer.py | 46 +------------- vllm/utils/flashinfer.py | 81 +++++++++++++++++++++++- vllm/v1/attention/backends/flashinfer.py | 49 +------------- vllm/v1/attention/backends/mla/common.py | 16 ++--- 4 files changed, 93 insertions(+), 99 deletions(-) diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 824ff8cca201a..b3372ce2eca8c 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -44,9 +44,9 @@ from vllm.attention.layer import Attention from vllm.attention.ops.paged_attn import PagedAttention from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger -from vllm.platforms import current_platform from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype, make_tensor_with_pad) +from vllm.utils.flashinfer import use_trtllm_decode_attention logger = init_logger(__name__) @@ -56,7 +56,6 @@ if TYPE_CHECKING: class FlashInferBackend(AttentionBackend): - cached_sm100a_supported: Optional[bool] = None @staticmethod def get_name() -> str: @@ -123,47 +122,6 @@ class FlashInferBackend(AttentionBackend): else: raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}") - @staticmethod - def use_trtllm_decode_attention( - batch_size: int, - max_seq_len: int, - kv_cache_dtype: str, - num_qo_heads: Optional[int], - num_kv_heads: Optional[int], - attn_head_size: Optional[int], - ) -> bool: - if FlashInferBackend.cached_sm100a_supported is None: - FlashInferBackend.cached_sm100a_supported = 
( - current_platform.has_device_capability(100)) - if not FlashInferBackend.cached_sm100a_supported: - return False - # Check if the dimensions are supported by TRTLLM decode attention - if (attn_head_size is None or num_qo_heads is None - or num_kv_heads is None or num_qo_heads // num_kv_heads > 8 - or num_qo_heads % num_kv_heads != 0 or attn_head_size != 128): - return False - env_value = envs.VLLM_USE_TRTLLM_DECODE_ATTENTION - if env_value is not None: - logger.info_once("VLLM_USE_TRTLLM_DECODE_ATTENTION is set to %s", - env_value) - # Environment variable is set - respect it - # Making the conditional check for zero because - # the path is automatically enabled if the batch size condition - # is satisfied. - no_use_trtllm = (env_value == "0") - if not no_use_trtllm: - logger.info_once("Using TRTLLM decode attention.") - return not no_use_trtllm - else: - # Environment variable not set - use auto-detection - use_trtllm = (FlashInferBackend.cached_sm100a_supported - and batch_size <= 256 and max_seq_len < 131072 - and kv_cache_dtype == "auto") - if use_trtllm: - logger.warning_once( - "Using TRTLLM decode attention (auto-detected).") - return use_trtllm - @dataclass class PerLayerParameters: @@ -1156,7 +1114,7 @@ class FlashInferImpl(AttentionImpl): assert decode_meta.decode_wrapper._sm_scale == softmax_scale # TODO: @pavanimajety Remove this once the switch happens # inside flashinfer. 
- if not FlashInferBackend.use_trtllm_decode_attention( + if not use_trtllm_decode_attention( num_decode_tokens, attn_metadata.max_decode_seq_len, kv_cache_dtype, attn_metadata.num_qo_heads, attn_metadata.num_kv_heads, attn_metadata.head_dim): diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 3bfb9808c0a00..29967bc516715 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -10,12 +10,25 @@ import contextlib import functools import importlib import importlib.util -from typing import Any, Callable, NoReturn +import os +from typing import Any, Callable, NoReturn, Optional +import requests + +import vllm.envs as envs from vllm.logger import init_logger +from vllm.platforms import current_platform logger = init_logger(__name__) +# This is the storage path for the cubins, it can be replaced +# with a local path for testing. +# Referenced from https://github.com/flashinfer-ai/flashinfer/blob/0c9a92c3d9a7e043ab6f3f7b2273269caf6ab044/flashinfer/jit/cubin_loader.py#L35 # noqa: E501 +FLASHINFER_CUBINS_REPOSITORY = os.environ.get( + "FLASHINFER_CUBINS_REPOSITORY", + "https://edge.urm.nvidia.com/artifactory/sw-kernelinferencelibrary-public-generic-local/", # noqa: E501 +) + @functools.cache def has_flashinfer() -> bool: @@ -108,6 +121,70 @@ def has_flashinfer_cutlass_fused_moe() -> bool: return True +@functools.cache +def has_nvidia_artifactory() -> bool: + """Return ``True`` if NVIDIA's artifactory is accessible. + + This checks connectivity to the kernel inference library artifactory + which is required for downloading certain cubin kernels like TRTLLM FHMA. 
+ """ + try: + # Use a short timeout to avoid blocking for too long + response = requests.get(FLASHINFER_CUBINS_REPOSITORY, timeout=5) + accessible = response.status_code == 200 + if accessible: + logger.debug_once("NVIDIA artifactory is accessible") + else: + logger.warning_once( + "NVIDIA artifactory returned failed status code: %d", + response.status_code) + return accessible + except Exception as e: + logger.warning_once("Failed to connect to NVIDIA artifactory: %s", e) + return False + + +def use_trtllm_decode_attention( + num_tokens: int, + max_seq_len: int, + kv_cache_dtype: str, + num_qo_heads: Optional[int], + num_kv_heads: Optional[int], + attn_head_size: Optional[int], +) -> bool: + # Requires SM100 and NVIDIA artifactory to be accessible to download cubins + if not (current_platform.is_device_capability(100) + and has_nvidia_artifactory()): + return False + + # Check if the dimensions are supported by TRTLLM decode attention + if (attn_head_size is None or num_qo_heads is None or num_kv_heads is None + or num_qo_heads // num_kv_heads > 8 + or num_qo_heads % num_kv_heads != 0 or attn_head_size != 128): + return False + + env_value = envs.VLLM_USE_TRTLLM_DECODE_ATTENTION + if env_value is not None: + logger.info_once("VLLM_USE_TRTLLM_DECODE_ATTENTION is set to %s", + env_value) + # Environment variable is set - respect it + # Making the conditional check for zero because + # the path is automatically enabled if the batch size condition + # is satisfied. 
+ no_use_trtllm = (env_value == "0") + if not no_use_trtllm: + logger.info_once("Using TRTLLM decode attention.") + return not no_use_trtllm + else: + # Environment variable not set - use auto-detection + use_trtllm = (num_tokens <= 256 and max_seq_len < 131072 + and kv_cache_dtype == "auto") + if use_trtllm: + logger.warning_once( + "Using TRTLLM decode attention (auto-detected).") + return use_trtllm + + __all__ = [ "has_flashinfer", "flashinfer_trtllm_fp8_block_scale_moe", @@ -117,4 +194,6 @@ __all__ = [ "autotune", "has_flashinfer_moe", "has_flashinfer_cutlass_fused_moe", + "has_nvidia_artifactory", + "use_trtllm_decode_attention", ] diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 27552f0e7c1ef..f8af1d7e41831 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -17,8 +17,8 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionType) from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.platforms import current_platform from vllm.utils import cdiv +from vllm.utils.flashinfer import use_trtllm_decode_attention from vllm.v1.attention.backends.flash_attn import use_cascade_attention from vllm.v1.attention.backends.utils import ( AttentionMetadataBuilder, CommonAttentionMetadata, get_kv_cache_layout, @@ -38,7 +38,6 @@ logger = init_logger(__name__) class FlashInferBackend(AttentionBackend): accept_output_buffer: bool = True - cached_sm100a_supported: Optional[bool] = None @classmethod def get_supported_dtypes(cls) -> list[torch.dtype]: @@ -98,48 +97,6 @@ class FlashInferBackend(AttentionBackend): raise ValueError(f"Unknown cache layout format {cache_layout}.") return stride_order - @staticmethod - def use_trtllm_decode_attention( - batch_size: int, - max_seq_len: int, - kv_cache_dtype: str, - num_qo_heads: int, - num_kv_heads: int, - attn_head_size: int, - ) -> bool: - if 
FlashInferBackend.cached_sm100a_supported is None: - FlashInferBackend.cached_sm100a_supported = ( - current_platform.has_device_capability(100)) - if not FlashInferBackend.cached_sm100a_supported: - return False - if (num_qo_heads // num_kv_heads > 8 - or num_qo_heads % num_kv_heads != 0 or attn_head_size != 128): - return False - env_value = envs.VLLM_USE_TRTLLM_DECODE_ATTENTION - if env_value is not None: - logger.info_once("VLLM_USE_TRTLLM_DECODE_ATTENTION is set to %s", - env_value) - # Environment variable is set - respect it - # Making the conditional check for zero because - # the path is automatically enabled if the batch size condition - # is satisfied. - no_use_trtllm = env_value == "0" - if not no_use_trtllm: - logger.info_once( - "VLLM_USE_TRTLLM_DECODE_ATTENTION is set to 1, " - "using TRTLLM decode attention.") - return not no_use_trtllm - else: - # Environment variable not set - use auto-detection - # Only supports attention head size of 128 - use_trtllm = (FlashInferBackend.cached_sm100a_supported - and batch_size <= 256 and max_seq_len < 131072 - and kv_cache_dtype == "auto") - if use_trtllm: - logger.warning_once( - "Using TRTLLM decode attention (auto-detected).") - return use_trtllm - @staticmethod def get_fp8_dtype_for_flashinfer(kv_cache_dtype: str) -> torch.dtype: if kv_cache_dtype in ("fp8", "fp8_e4m3"): @@ -352,7 +309,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): if num_decodes > 0: attn_metadata.decode_wrapper = self._get_decode_wrapper() - if not FlashInferBackend.use_trtllm_decode_attention( + if not use_trtllm_decode_attention( num_decodes, attn_metadata.max_seq_len, self.cache_config.cache_dtype, attn_metadata.num_qo_heads, attn_metadata.num_kv_heads, @@ -636,7 +593,7 @@ class FlashInferImpl(AttentionImpl): decode_query = query[:num_decode_tokens] assert decode_query.shape[0] == num_decode_tokens assert decode_wrapper is not None - if not FlashInferBackend.use_trtllm_decode_attention( + if not 
use_trtllm_decode_attention( attn_metadata.num_decodes, attn_metadata.max_seq_len, self.kv_cache_dtype, attn_metadata.num_qo_heads, attn_metadata.num_kv_heads, attn_metadata.head_dim): diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 0095d75217856..d112468f1c91d 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -209,6 +209,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, UnquantizedLinearMethod) from vllm.platforms import current_platform from vllm.utils import cdiv, round_down +from vllm.utils.flashinfer import has_nvidia_artifactory from vllm.v1.attention.backends.utils import ( AttentionMetadataBuilder, CommonAttentionMetadata, get_per_layer_parameters, infer_global_hyperparameters, @@ -379,17 +380,16 @@ M = TypeVar("M", bound=MLACommonMetadata) def use_flashinfer_prefill() -> bool: - if flashinfer_available and not envs.VLLM_USE_CUDNN_PREFILL: - # For blackwell default to flashinfer prefill if its available since - # its faster than FA2. - return current_platform.has_device_capability(100) - return False + # For blackwell default to flashinfer prefill if its available since + # it is faster than FA2. + return (flashinfer_available and not envs.VLLM_USE_CUDNN_PREFILL + and current_platform.is_device_capability(100)) def use_cudnn_prefill() -> bool: - if flashinfer_available and envs.VLLM_USE_CUDNN_PREFILL: - return current_platform.has_device_capability(100) - return False + return (flashinfer_available and envs.VLLM_USE_CUDNN_PREFILL + and current_platform.is_device_capability(100) + and has_nvidia_artifactory()) # Currently 394MB, this can be tuned based on GEMM sizes used. 
From 0a6d305e0f7b63b06c87bb1f7564ae8d148a3311 Mon Sep 17 00:00:00 2001 From: Gamhang Date: Fri, 1 Aug 2025 21:07:33 +0800 Subject: [PATCH 26/54] feat(multimodal): Add customizable background color for RGBA to RGB conversion (#22052) Signed-off-by: Jinheng Li Co-authored-by: Jinheng Li --- docs/features/multimodal_inputs.md | 44 +++++++++++ tests/multimodal/test_image.py | 115 ++++++++++++++++++++++++++++- vllm/multimodal/image.py | 37 ++++++++-- 3 files changed, 190 insertions(+), 6 deletions(-) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index b8677f11a1d3c..cdd32924b5668 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -172,6 +172,36 @@ Multi-image input can be extended to perform video captioning. We show this with print(generated_text) ``` +#### Custom RGBA Background Color + +When loading RGBA images (images with transparency), vLLM converts them to RGB format. By default, transparent pixels are replaced with white background. You can customize this background color using the `rgba_background_color` parameter in `media_io_kwargs`. + +??? code + + ```python + from vllm import LLM + + # Default white background (no configuration needed) + llm = LLM(model="llava-hf/llava-1.5-7b-hf") + + # Custom black background for dark theme + llm = LLM( + model="llava-hf/llava-1.5-7b-hf", + media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}} + ) + + # Custom brand color background (e.g., blue) + llm = LLM( + model="llava-hf/llava-1.5-7b-hf", + media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}} + ) + ``` + +!!! 
note + - The `rgba_background_color` accepts RGB values as a list `[R, G, B]` or tuple `(R, G, B)` where each value is 0-255 + - This setting only affects RGBA images with transparency; RGB images are unchanged + - If not specified, the default white background `(255, 255, 255)` is used for backward compatibility + ### Video Inputs You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary @@ -478,6 +508,20 @@ Full example: ``` +#### Custom RGBA Background Color + +To use a custom background color for RGBA images, pass the `rgba_background_color` parameter via `--media-io-kwargs`: + +```bash +# Example: Black background for dark theme +vllm serve llava-hf/llava-1.5-7b-hf \ + --media-io-kwargs '{"image": {"rgba_background_color": [0, 0, 0]}}' + +# Example: Custom gray background +vllm serve llava-hf/llava-1.5-7b-hf \ + --media-io-kwargs '{"image": {"rgba_background_color": [128, 128, 128]}}' +``` + ### Audio Inputs Audio input is supported according to [OpenAI Audio API](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in). 
diff --git a/tests/multimodal/test_image.py b/tests/multimodal/test_image.py index cfd44351a6d1f..271a85f1195ec 100644 --- a/tests/multimodal/test_image.py +++ b/tests/multimodal/test_image.py @@ -3,9 +3,10 @@ from pathlib import Path import numpy as np +import pytest from PIL import Image, ImageChops -from vllm.multimodal.image import convert_image_mode +from vllm.multimodal.image import ImageMediaIO, convert_image_mode ASSETS_DIR = Path(__file__).parent / "assets" assert ASSETS_DIR.exists() @@ -35,3 +36,115 @@ def test_rgba_to_rgb(): assert converted_image_numpy[i][j][0] == 255 assert converted_image_numpy[i][j][1] == 255 assert converted_image_numpy[i][j][2] == 255 + + +def test_rgba_to_rgb_custom_background(tmp_path): + """Test RGBA to RGB conversion with custom background colors.""" + # Create a simple RGBA image with transparent and opaque pixels + rgba_image = Image.new("RGBA", (10, 10), + (255, 0, 0, 255)) # Red with full opacity + + # Make top-left quadrant transparent + for i in range(5): + for j in range(5): + rgba_image.putpixel((i, j), (0, 0, 0, 0)) # Fully transparent + + # Save the test image to tmp_path + test_image_path = tmp_path / "test_rgba.png" + rgba_image.save(test_image_path) + + # Test 1: Default white background (backward compatibility) + image_io_default = ImageMediaIO() + converted_default = image_io_default.load_file(test_image_path) + default_numpy = np.array(converted_default) + + # Check transparent pixels are white + assert default_numpy[0][0][0] == 255 # R + assert default_numpy[0][0][1] == 255 # G + assert default_numpy[0][0][2] == 255 # B + # Check opaque pixels remain red + assert default_numpy[5][5][0] == 255 # R + assert default_numpy[5][5][1] == 0 # G + assert default_numpy[5][5][2] == 0 # B + + # Test 2: Custom black background via kwargs + image_io_black = ImageMediaIO(rgba_background_color=(0, 0, 0)) + converted_black = image_io_black.load_file(test_image_path) + black_numpy = np.array(converted_black) + + # Check 
transparent pixels are black + assert black_numpy[0][0][0] == 0 # R + assert black_numpy[0][0][1] == 0 # G + assert black_numpy[0][0][2] == 0 # B + # Check opaque pixels remain red + assert black_numpy[5][5][0] == 255 # R + assert black_numpy[5][5][1] == 0 # G + assert black_numpy[5][5][2] == 0 # B + + # Test 3: Custom blue background via kwargs (as list) + image_io_blue = ImageMediaIO(rgba_background_color=[0, 0, 255]) + converted_blue = image_io_blue.load_file(test_image_path) + blue_numpy = np.array(converted_blue) + + # Check transparent pixels are blue + assert blue_numpy[0][0][0] == 0 # R + assert blue_numpy[0][0][1] == 0 # G + assert blue_numpy[0][0][2] == 255 # B + + # Test 4: Test with load_bytes method + with open(test_image_path, 'rb') as f: + image_data = f.read() + + image_io_green = ImageMediaIO(rgba_background_color=(0, 255, 0)) + converted_green = image_io_green.load_bytes(image_data) + green_numpy = np.array(converted_green) + + # Check transparent pixels are green + assert green_numpy[0][0][0] == 0 # R + assert green_numpy[0][0][1] == 255 # G + assert green_numpy[0][0][2] == 0 # B + + +def test_rgba_background_color_validation(): + """Test that invalid rgba_background_color values are properly rejected.""" + + # Test invalid types + with pytest.raises(ValueError, + match="rgba_background_color must be a list or tuple"): + ImageMediaIO(rgba_background_color="255,255,255") + + with pytest.raises(ValueError, + match="rgba_background_color must be a list or tuple"): + ImageMediaIO(rgba_background_color=255) + + # Test wrong number of elements + with pytest.raises(ValueError, + match="rgba_background_color must be a list or tuple"): + ImageMediaIO(rgba_background_color=(255, 255)) + + with pytest.raises(ValueError, + match="rgba_background_color must be a list or tuple"): + ImageMediaIO(rgba_background_color=(255, 255, 255, 255)) + + # Test non-integer values + with pytest.raises(ValueError, + match="rgba_background_color must be a list or tuple"): + 
ImageMediaIO(rgba_background_color=(255.0, 255.0, 255.0)) + + with pytest.raises(ValueError, + match="rgba_background_color must be a list or tuple"): + ImageMediaIO(rgba_background_color=(255, "255", 255)) + + # Test out of range values + with pytest.raises(ValueError, + match="rgba_background_color must be a list or tuple"): + ImageMediaIO(rgba_background_color=(256, 255, 255)) + + with pytest.raises(ValueError, + match="rgba_background_color must be a list or tuple"): + ImageMediaIO(rgba_background_color=(255, -1, 255)) + + # Test that valid values work + ImageMediaIO(rgba_background_color=(0, 0, 0)) # Should not raise + ImageMediaIO(rgba_background_color=[255, 255, 255]) # Should not raise + ImageMediaIO(rgba_background_color=(128, 128, 128)) # Should not raise diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index a0448a80ac7c2..1006c1ce4b241 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -3,6 +3,7 @@ from io import BytesIO from pathlib import Path +from typing import Union import pybase64 import torch @@ -23,9 +24,10 @@ def rescale_image_size(image: Image.Image, return image -# TODO: Support customizable background color to fill in. def rgba_to_rgb( - image: Image.Image, background_color=(255, 255, 255)) -> Image.Image: + image: Image.Image, + background_color: Union[tuple[int, int, int], list[int]] = (255, 255, 255) +) -> Image.Image: """Convert an RGBA image to RGB with filled background color.""" assert image.mode == "RGBA" converted = Image.new("RGB", image.size, background_color) @@ -55,10 +57,35 @@ class ImageMediaIO(MediaIO[Image.Image]): # for flexible control. 
self.kwargs = kwargs + # Extract RGBA background color from kwargs if provided + # Default to white background for backward compatibility + rgba_bg = kwargs.get('rgba_background_color', (255, 255, 255)) + # Convert list to tuple for consistency + if isinstance(rgba_bg, list): + rgba_bg = tuple(rgba_bg) + + # Validate rgba_background_color format + if not (isinstance(rgba_bg, tuple) and len(rgba_bg) == 3 + and all(isinstance(c, int) and 0 <= c <= 255 + for c in rgba_bg)): + raise ValueError( + "rgba_background_color must be a list or tuple of 3 integers " + "in the range [0, 255].") + self.rgba_background_color = rgba_bg + + def _convert_image_mode(self, image: Image.Image) -> Image.Image: + """Convert image mode with custom background color.""" + if image.mode == self.image_mode: + return image + elif image.mode == "RGBA" and self.image_mode == "RGB": + return rgba_to_rgb(image, self.rgba_background_color) + else: + return convert_image_mode(image, self.image_mode) + def load_bytes(self, data: bytes) -> Image.Image: image = Image.open(BytesIO(data)) image.load() - return convert_image_mode(image, self.image_mode) + return self._convert_image_mode(image) def load_base64(self, media_type: str, data: str) -> Image.Image: return self.load_bytes(pybase64.b64decode(data, validate=True)) @@ -66,7 +93,7 @@ class ImageMediaIO(MediaIO[Image.Image]): def load_file(self, filepath: Path) -> Image.Image: image = Image.open(filepath) image.load() - return convert_image_mode(image, self.image_mode) + return self._convert_image_mode(image) def encode_base64( self, @@ -77,7 +104,7 @@ class ImageMediaIO(MediaIO[Image.Image]): image = media with BytesIO() as buffer: - image = convert_image_mode(image, self.image_mode) + image = self._convert_image_mode(image) image.save(buffer, image_format) data = buffer.getvalue() From 5c54d9759d3e12d66919826bf1b7c196914d3a92 Mon Sep 17 00:00:00 2001 From: Abirdcfly Date: Fri, 1 Aug 2025 21:08:45 +0800 Subject: [PATCH 27/54] [Bugfix][PD] set 
max_completion_tokens=1 if req has this value (#21841) Signed-off-by: Abirdcfly --- .../online_serving/disaggregated_serving/disagg_proxy_demo.py | 2 ++ .../disagg_proxy_p2p_nccl_xpyd.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py index 16c32dcaa5d31..d39edb0b9d15c 100644 --- a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py +++ b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py @@ -293,6 +293,8 @@ class Proxy: # add params to request kv_prepare_request = request.copy() kv_prepare_request["max_tokens"] = 1 + if "max_completion_tokens" in kv_prepare_request: + kv_prepare_request["max_completion_tokens"] = 1 # prefill stage prefill_instance = self.schedule(self.prefill_cycler) diff --git a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py index a6fd92feb2f11..73da7af85f1d9 100644 --- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py +++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py @@ -128,6 +128,8 @@ async def handle_request(): prefill_request = original_request_data.copy() # change max_tokens = 1 to let it only do prefill prefill_request["max_tokens"] = 1 + if "max_completion_tokens" in prefill_request: + prefill_request["max_completion_tokens"] = 1 global count global prefill_instances From a59cd9d9f7fd89e19beeffb7e7f89437d413eafb Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 1 Aug 2025 09:10:30 -0400 Subject: [PATCH 28/54] [Refactor] Fix Compile Warning #1444-D (#21462) Signed-off-by: yewentao256 --- csrc/moe/topk_softmax_kernels.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index 0b505d2e04a21..7a7865b901de1 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -24,9 +24,12 @@ #ifndef USE_ROCM #include #include + #include + using AddOp = cuda::std::plus; #else #include #include + using AddOp = cub::Sum; #endif #define MAX(a, b) ((a) > (b) ? (a) : (b)) @@ -62,7 +65,6 @@ __launch_bounds__(TPB) __global__ const int thread_row_offset = blockIdx.x * num_cols; - cub::Sum sum; float threadData(-FLT_MAX); // Don't touch finished rows. @@ -92,7 +94,7 @@ __launch_bounds__(TPB) __global__ threadData += exp((static_cast(input[idx]) - float_max)); } - const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum); + const auto Z = BlockReduce(tmpStorage).Reduce(threadData, AddOp()); if (threadIdx.x == 0) { From 8026a335a135af2e53c7d89652863312d7a3c936 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Fri, 1 Aug 2025 10:11:29 -0400 Subject: [PATCH 29/54] [BugFix] Update AttnFusionPass cache key (#21947) Signed-off-by: Richard Zou --- vllm/compilation/fusion_attn.py | 3 +++ vllm/compilation/inductor_pass.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/compilation/fusion_attn.py b/vllm/compilation/fusion_attn.py index 79518b6f4f965..a40a8caf34a88 100644 --- a/vllm/compilation/fusion_attn.py +++ b/vllm/compilation/fusion_attn.py @@ -164,3 +164,6 @@ class AttnFusionPass(VllmInductorPass): logger.debug("Fused quantization onto %s attention nodes", count) self.dump_graph(graph, "after_attn_fusion") self.end_and_log() + + def uuid(self): + return VllmInductorPass.hash_source(self, AttentionStaticQuantPattern) diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py index 810d0801e9f38..2a149c65b3877 100644 --- a/vllm/compilation/inductor_pass.py +++ b/vllm/compilation/inductor_pass.py @@ -76,9 +76,10 @@ class InductorPass(CustomGraphPass): for src in srcs: if isinstance(src, str): src_str = src - 
elif isinstance(src, types.FunctionType): + elif isinstance(src, (types.FunctionType, type)): src_str = inspect.getsource(src) else: + # object instance src_str = inspect.getsource(src.__class__) hasher.update(src_str.encode("utf-8")) return hasher.hexdigest() From 3146519add735bc51a6a983af9e9c4a8b8d3373e Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 1 Aug 2025 15:37:55 +0100 Subject: [PATCH 30/54] [BugFix] Don't change title of top-level process (#22032) Signed-off-by: Nick Hill --- vllm/entrypoints/cli/serve.py | 11 ++++++----- vllm/entrypoints/openai/api_server.py | 4 ++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 0305354a66e85..9762a1de9edd3 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -18,7 +18,8 @@ from vllm.entrypoints.utils import (VLLM_SUBCMD_PARSER_EPILOG, show_filtered_argument_or_group_from_help) from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, decorate_logs, get_tcp_uri +from vllm.utils import (FlexibleArgumentParser, decorate_logs, get_tcp_uri, + set_process_title) from vllm.v1.engine.core import EngineCoreProc from vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines from vllm.v1.executor.abstract import Executor @@ -74,7 +75,7 @@ def run_headless(args: argparse.Namespace): if args.api_server_count > 1: raise ValueError("api_server_count can't be set in headless mode") - # set_process_title("Headless_ProcManager") + # Create the EngineConfig. 
engine_args = vllm.AsyncEngineArgs.from_cli_args(args) usage_context = UsageContext.OPENAI_API_SERVER @@ -139,8 +140,6 @@ def run_multi_api_server(args: argparse.Namespace): orig_disable_mm_preprocessor_cache = args.disable_mm_preprocessor_cache - # set_process_title("ProcManager") - if num_api_servers > 1: setup_multiprocess_prometheus() @@ -225,7 +224,9 @@ def run_api_server_worker_proc(listen_address, **uvicorn_kwargs) -> None: """Entrypoint for individual API server worker processes.""" - # Add process-specific prefix to stdout and stderr. + # Set process title and add process-specific prefix to stdout and stderr. + server_index = client_config.get("client_index", 0) if client_config else 0 + set_process_title("APIServer", str(server_index)) decorate_logs() uvloop.run( diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 26db1357da4d0..1be03c57a1f1b 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -102,7 +102,7 @@ from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.usage.usage_lib import UsageContext from vllm.utils import (Device, FlexibleArgumentParser, decorate_logs, get_open_zmq_ipc_path, is_valid_ipv6_address, - set_process_title, set_ulimit) + set_ulimit) from vllm.v1.metrics.prometheus import get_prometheus_registry from vllm.version import __version__ as VLLM_VERSION @@ -1824,7 +1824,7 @@ async def run_server_worker(listen_address, ToolParserManager.import_tool_parser(args.tool_parser_plugin) server_index = client_config.get("client_index", 0) if client_config else 0 - set_process_title("APIServer", str(server_index)) + # Load logging config for uvicorn if specified log_config = load_log_config(args.log_config_file) if log_config is not None: From 97608dc276c292d9217eb6d334d969c5e89913c6 Mon Sep 17 00:00:00 2001 From: David Xia Date: Fri, 1 Aug 2025 10:55:55 -0400 Subject: [PATCH 31/54] [Docs] use `uv` in CPU installation docs (#22089) 
Signed-off-by: David Xia --- .../installation/cpu/apple.inc.md | 12 ++--- .../installation/cpu/build.inc.md | 22 +++++---- .../installation/cpu/s390x.inc.md | 45 ++++++++++++------- 3 files changed, 48 insertions(+), 31 deletions(-) diff --git a/docs/getting_started/installation/cpu/apple.inc.md b/docs/getting_started/installation/cpu/apple.inc.md index 0816f38ac68a1..2828173a76a9a 100644 --- a/docs/getting_started/installation/cpu/apple.inc.md +++ b/docs/getting_started/installation/cpu/apple.inc.md @@ -1,6 +1,6 @@ # --8<-- [start:installation] -vLLM has experimental support for macOS with Apple silicon. For now, users shall build from the source vLLM to natively run on macOS. +vLLM has experimental support for macOS with Apple silicon. For now, users must build from source to natively run on macOS. Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. @@ -23,20 +23,20 @@ Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. # --8<-- [end:pre-built-wheels] # --8<-- [start:build-wheel-from-source] -After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source. +After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from source. ```bash git clone https://github.com/vllm-project/vllm.git cd vllm -pip install -r requirements/cpu.txt -pip install -e . +uv pip install -r requirements/cpu.txt +uv pip install -e . ``` !!! note - On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device. + On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which is currently the only supported device. !!! 
example "Troubleshooting" - If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your + If the build fails with errors like the following where standard C++ headers cannot be found, try to remove and reinstall your [Command Line Tools for Xcode](https://developer.apple.com/download/all/). ```text diff --git a/docs/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md index fa777fe0c8a1a..57a09e674a821 100644 --- a/docs/getting_started/installation/cpu/build.inc.md +++ b/docs/getting_started/installation/cpu/build.inc.md @@ -1,4 +1,4 @@ -First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: +First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run: ```bash sudo apt-get update -y @@ -6,28 +6,34 @@ sudo apt-get install -y --no-install-recommends ccache git curl wget ca-certific sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 ``` -Second, clone vLLM project: +Second, clone the vLLM project: ```bash git clone https://github.com/vllm-project/vllm.git vllm_source cd vllm_source ``` -Third, install Python packages for vLLM CPU backend building: +Third, install required dependencies: ```bash -pip install --upgrade pip -pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu -pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +uv pip install -r requirements/cpu-build.txt --torch-backend auto +uv pip install -r requirements/cpu.txt --torch-backend auto ``` -Finally, build and install vLLM CPU backend: +??? 
console "pip" + ```bash + pip install --upgrade pip + pip install -v -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu + pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu + ``` + +Finally, build and install vLLM: ```bash VLLM_TARGET_DEVICE=cpu python setup.py install ``` -If you want to develop vllm, install it in editable mode instead. +If you want to develop vLLM, install it in editable mode instead. ```bash VLLM_TARGET_DEVICE=cpu python setup.py develop diff --git a/docs/getting_started/installation/cpu/s390x.inc.md b/docs/getting_started/installation/cpu/s390x.inc.md index acfb3396896bf..c1917267ce91b 100644 --- a/docs/getting_started/installation/cpu/s390x.inc.md +++ b/docs/getting_started/installation/cpu/s390x.inc.md @@ -1,6 +1,6 @@ # --8<-- [start:installation] -vLLM has experimental support for s390x architecture on IBM Z platform. For now, users shall build from the vLLM source to natively run on IBM Z platform. +vLLM has experimental support for s390x architecture on IBM Z platform. For now, users must build from source to natively run on IBM Z platform. Currently the CPU implementation for s390x architecture supports FP32 datatype only. @@ -40,21 +40,32 @@ curl https://sh.rustup.rs -sSf | sh -s -- -y && \ . "$HOME/.cargo/env" ``` -Execute the following commands to build and install vLLM from the source. +Execute the following commands to build and install vLLM from source. !!! tip - Please build the following dependencies, `torchvision`, `pyarrow` from the source before building vLLM. + Please build the following dependencies, `torchvision`, `pyarrow` from source before building vLLM. 
```bash sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds - pip install -v \ - --extra-index-url https://download.pytorch.org/whl/nightly/cpu \ + uv pip install -v \ + --torch-backend auto \ -r requirements-build.txt \ -r requirements-cpu.txt \ VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \ - pip install dist/*.whl + uv pip install dist/*.whl ``` +??? console "pip" + ```bash + sed -i '/^torch/d' requirements-build.txt # remove torch from requirements-build.txt since we use nightly builds + pip install -v \ + --extra-index-url https://download.pytorch.org/whl/nightly/cpu \ + -r requirements-build.txt \ + -r requirements-cpu.txt \ + VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \ + pip install dist/*.whl + ``` + # --8<-- [end:build-wheel-from-source] # --8<-- [start:pre-built-images] @@ -63,19 +74,19 @@ Execute the following commands to build and install vLLM from the source. ```bash docker build -f docker/Dockerfile.s390x \ - --tag vllm-cpu-env . + --tag vllm-cpu-env . 
-# Launching OpenAI server +# Launch OpenAI server docker run --rm \ - --privileged=true \ - --shm-size=4g \ - -p 8000:8000 \ - -e VLLM_CPU_KVCACHE_SPACE= \ - -e VLLM_CPU_OMP_THREADS_BIND= \ - vllm-cpu-env \ - --model=meta-llama/Llama-3.2-1B-Instruct \ - --dtype=float \ - other vLLM OpenAI server arguments + --privileged true \ + --shm-size 4g \ + -p 8000:8000 \ + -e VLLM_CPU_KVCACHE_SPACE= \ + -e VLLM_CPU_OMP_THREADS_BIND= \ + vllm-cpu-env \ + --model meta-llama/Llama-3.2-1B-Instruct \ + --dtype float \ + other vLLM OpenAI server arguments ``` # --8<-- [end:build-image-from-source] From 2d7b09b998980b9ccbb3708632b47bc28de076aa Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 1 Aug 2025 17:16:37 +0100 Subject: [PATCH 32/54] Deprecate `--disable-log-requests` and replace with `--enable-log-requests` (#21739) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/nightly-benchmarks/README.md | 1 - .../tests/genai-perf-tests.json | 1 - .../tests/nightly-tests.json | 6 ---- .../tests/serving-tests-cpu-snc2.json | 6 ---- .../tests/serving-tests-cpu-snc3.json | 6 ---- .../tests/serving-tests-cpu.json | 5 ---- .../tests/serving-tests.json | 6 +--- tests/config/test_mp_reducer.py | 1 - tests/mq_llm_engine/test_load.py | 2 +- tests/v1/engine/test_async_llm.py | 4 +-- tests/v1/test_async_llm_dp.py | 1 - vllm/engine/arg_utils.py | 30 ++++++++++++++++--- vllm/engine/async_llm_engine.py | 26 +++++++++------- vllm/engine/multiprocessing/engine.py | 27 ++++++++++++----- vllm/entrypoints/openai/api_server.py | 12 ++++---- vllm/entrypoints/openai/run_batch.py | 6 ++-- vllm/utils/__init__.py | 6 ++++ vllm/v1/engine/async_llm.py | 30 +++++++++++-------- 18 files changed, 97 insertions(+), 79 deletions(-) diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index fcde284efea98..3721d3d1d6749 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ 
b/.buildkite/nightly-benchmarks/README.md @@ -104,7 +104,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co "tensor_parallel_size": 1, "swap_space": 16, "disable_log_stats": "", - "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { diff --git a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json index edbe9f2df0ce0..f26ae7634f3d9 100644 --- a/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json +++ b/.buildkite/nightly-benchmarks/tests/genai-perf-tests.json @@ -11,7 +11,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index fda1a7a3ec53c..41b4a4008801d 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -35,7 +35,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, @@ -90,7 +89,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, @@ -145,7 +143,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, @@ -197,7 +194,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, @@ -251,7 +247,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, @@ -305,7 
+300,6 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", "gpu_memory_utilization": 0.9, "num_scheduler_steps": 10, "max_num_seqs": 512, diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json index a144b4420fbf1..dd0e24edff98d 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json @@ -17,7 +17,6 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -50,7 +49,6 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -83,7 +81,6 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -117,7 +114,6 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -153,7 +149,6 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -189,7 +184,6 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json index e6e69b63b74df..f1bda65a7590b 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json +++ 
b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json @@ -17,7 +17,6 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -50,7 +49,6 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -84,7 +82,6 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -118,7 +115,6 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -154,7 +150,6 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -191,7 +186,6 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json index ce1f924de387f..f150b9abeea45 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json @@ -17,7 +17,6 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -50,7 +49,6 @@ "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -83,7 +81,6 @@ "block_size": 128, "trust_remote_code": "", 
"disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -117,7 +114,6 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, @@ -153,7 +149,6 @@ "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", - "disable_log_requests": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json index 13fd5aa8db97b..a6d4141d5c2dc 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json @@ -7,7 +7,6 @@ "tensor_parallel_size": 1, "swap_space": 16, "disable_log_stats": "", - "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { @@ -26,7 +25,6 @@ "tensor_parallel_size": 4, "swap_space": 16, "disable_log_stats": "", - "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { @@ -45,7 +43,6 @@ "tensor_parallel_size": 2, "swap_space": 16, "disable_log_stats": "", - "disable_log_requests": "", "load_format": "dummy" }, "client_parameters": { @@ -60,8 +57,7 @@ "test_name": "serving_llama70B_tp4_sharegpt_specdecode", "qps_list": [2], "server_parameters": { - "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "disable_log_requests": "", + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", "tensor_parallel_size": 4, "swap_space": 16, "speculative_config": { diff --git a/tests/config/test_mp_reducer.py b/tests/config/test_mp_reducer.py index ee351cbfa7c16..d4d4be293280b 100644 --- a/tests/config/test_mp_reducer.py +++ b/tests/config/test_mp_reducer.py @@ -28,7 +28,6 @@ def test_mp_reducer(monkeypatch): max_model_len=32, gpu_memory_utilization=0.1, disable_log_stats=True, - disable_log_requests=True, ) 
async_llm = AsyncLLM.from_engine_args( diff --git a/tests/mq_llm_engine/test_load.py b/tests/mq_llm_engine/test_load.py index e9fd5b814f285..c934706611ae3 100644 --- a/tests/mq_llm_engine/test_load.py +++ b/tests/mq_llm_engine/test_load.py @@ -16,7 +16,7 @@ NUM_EXPECTED_TOKENS = 10 NUM_REQUESTS = 10000 # Scenarios to test for num generated token. -ENGINE_ARGS = AsyncEngineArgs(model=MODEL, disable_log_requests=True) +ENGINE_ARGS = AsyncEngineArgs(model=MODEL) @pytest.fixture(scope="function") diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 412df3acff126..21694491dd73a 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -26,12 +26,10 @@ if not current_platform.is_cuda(): TEXT_ENGINE_ARGS = AsyncEngineArgs( model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=True, - disable_log_requests=True, ) VISION_ENGINE_ARGS = AsyncEngineArgs(model="Qwen/Qwen2-VL-2B-Instruct", - enforce_eager=True, - disable_log_requests=True) + enforce_eager=True) TEXT_PROMPT = "Hello my name is Robert and" diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py index 6716d27f571f9..c2610a87ac780 100644 --- a/tests/v1/test_async_llm_dp.py +++ b/tests/v1/test_async_llm_dp.py @@ -25,7 +25,6 @@ DP_SIZE = int(os.getenv("DP_SIZE", 2)) engine_args = AsyncEngineArgs( model="ibm-research/PowerMoE-3b", enforce_eager=True, - disable_log_requests=True, tensor_parallel_size=int(os.getenv("TP_SIZE", 1)), data_parallel_size=DP_SIZE, ) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f938f19b90469..0d38b5b5302c1 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -18,7 +18,7 @@ from typing import (TYPE_CHECKING, Annotated, Any, Callable, Dict, List, import regex as re import torch from pydantic import TypeAdapter, ValidationError -from typing_extensions import TypeIs +from typing_extensions import TypeIs, deprecated import vllm.envs as envs from vllm.config import 
(BlockSize, CacheConfig, CacheDType, CompilationConfig, @@ -1704,7 +1704,23 @@ class EngineArgs: @dataclass class AsyncEngineArgs(EngineArgs): """Arguments for asynchronous vLLM engine.""" - disable_log_requests: bool = False + enable_log_requests: bool = False + + @property + @deprecated( + "`disable_log_requests` is deprecated and has been replaced with " + "`enable_log_requests`. This will be removed in v0.12.0. Please use " + "`enable_log_requests` instead.") + def disable_log_requests(self) -> bool: + return not self.enable_log_requests + + @disable_log_requests.setter + @deprecated( + "`disable_log_requests` is deprecated and has been replaced with " + "`enable_log_requests`. This will be removed in v0.12.0. Please use " + "`enable_log_requests` instead.") + def disable_log_requests(self, value: bool): + self.enable_log_requests = not value @staticmethod def add_cli_args(parser: FlexibleArgumentParser, @@ -1715,9 +1731,15 @@ class AsyncEngineArgs(EngineArgs): load_general_plugins() if not async_args_only: parser = EngineArgs.add_cli_args(parser) + parser.add_argument('--enable-log-requests', + action=argparse.BooleanOptionalAction, + default=AsyncEngineArgs.enable_log_requests, + help='Enable logging requests.') parser.add_argument('--disable-log-requests', - action='store_true', - help='Disable logging requests.') + action=argparse.BooleanOptionalAction, + default=not AsyncEngineArgs.enable_log_requests, + help='[DEPRECATED] Disable logging requests.', + deprecated=True) current_platform.pre_register_and_update(parser) return parser diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 06bb4eeab69eb..1f962b008ee03 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -30,7 +30,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import ExecuteModelRequest from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.usage.usage_lib import UsageContext -from vllm.utils 
import Device, weak_bind +from vllm.utils import Device, deprecate_kwargs, weak_bind logger = init_logger(__name__) ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S @@ -554,14 +554,20 @@ class AsyncLLMEngine(EngineClient): return LLMEngine._get_executor_cls(engine_config) @classmethod + @deprecate_kwargs( + "disable_log_requests", + additional_message=("This argument will have no effect. " + "Use `enable_log_requests` instead."), + ) def from_vllm_config( - cls, - vllm_config: VllmConfig, - start_engine_loop: bool = True, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[dict[str, StatLoggerBase]] = None, - disable_log_requests: bool = False, - disable_log_stats: bool = False, + cls, + vllm_config: VllmConfig, + start_engine_loop: bool = True, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[dict[str, StatLoggerBase]] = None, + enable_log_requests: bool = False, + disable_log_stats: bool = False, + disable_log_requests: bool = True, # Deprecated, will be removed ) -> "AsyncLLMEngine": """Create an AsyncLLMEngine from the EngineArgs.""" @@ -569,7 +575,7 @@ class AsyncLLMEngine(EngineClient): vllm_config=vllm_config, executor_class=cls._get_executor_cls(vllm_config), start_engine_loop=start_engine_loop, - log_requests=not disable_log_requests, + log_requests=enable_log_requests, log_stats=not disable_log_stats, usage_context=usage_context, stat_loggers=stat_loggers, @@ -598,7 +604,7 @@ class AsyncLLMEngine(EngineClient): usage_context=usage_context, stat_loggers=stat_loggers, disable_log_stats=engine_args.disable_log_stats, - disable_log_requests=engine_args.disable_log_requests, + enable_log_requests=engine_args.enable_log_requests, ) @property diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index fe6eb0d8c2f1a..903f3fd71ebcd 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -34,6 +34,7 
@@ from vllm.outputs import RequestOutput from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.usage.usage_lib import UsageContext +from vllm.utils import deprecate_kwargs from vllm.worker.model_runner_base import InputProcessingError logger = init_logger(__name__) @@ -120,10 +121,20 @@ class MQLLMEngine: return ENGINE_DEAD_ERROR() @classmethod - def from_vllm_config(cls, vllm_config: VllmConfig, - usage_context: UsageContext, - disable_log_requests: bool, disable_log_stats: bool, - ipc_path: str) -> "MQLLMEngine": + @deprecate_kwargs( + "disable_log_requests", + additional_message=("This argument will have no effect. " + "Use `enable_log_requests` instead."), + ) + def from_vllm_config( + cls, + vllm_config: VllmConfig, + usage_context: UsageContext, + enable_log_requests: bool, + disable_log_stats: bool, + ipc_path: str, + disable_log_requests: bool = True, # Deprecated, will be removed + ) -> "MQLLMEngine": # Setup plugins for each process from vllm.plugins import load_general_plugins load_general_plugins() @@ -136,7 +147,7 @@ class MQLLMEngine: ipc_path=ipc_path, usage_context=usage_context, use_async_sockets=use_async_sockets, - log_requests=(not disable_log_requests), + log_requests=enable_log_requests, log_stats=(not disable_log_stats), ) @@ -150,7 +161,7 @@ class MQLLMEngine: ipc_path=ipc_path, vllm_config=vllm_config, usage_context=usage_context, - disable_log_requests=engine_args.disable_log_requests, + enable_log_requests=engine_args.enable_log_requests, disable_log_stats=engine_args.disable_log_stats, ) @@ -436,7 +447,7 @@ def signal_handler(*_) -> None: def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext, ipc_path: str, disable_log_stats: bool, - disable_log_requests: bool, engine_alive): + enable_log_requests: bool, engine_alive): try: # Ensure we can serialize transformer config before spawning maybe_register_config_serialize_by_value() @@ -445,7 +456,7 @@ def run_mp_engine(vllm_config: 
VllmConfig, usage_context: UsageContext, vllm_config=vllm_config, usage_context=usage_context, disable_log_stats=disable_log_stats, - disable_log_requests=disable_log_requests, + enable_log_requests=enable_log_requests, ipc_path=ipc_path) signal.signal(signal.SIGTERM, signal_handler) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 1be03c57a1f1b..b8ec5461f7719 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -205,7 +205,7 @@ async def build_async_engine_client_from_engine_args( async_llm = AsyncLLM.from_vllm_config( vllm_config=vllm_config, usage_context=usage_context, - disable_log_requests=engine_args.disable_log_requests, + enable_log_requests=engine_args.enable_log_requests, disable_log_stats=engine_args.disable_log_stats, client_addresses=client_config, client_index=client_index) @@ -227,7 +227,7 @@ async def build_async_engine_client_from_engine_args( engine_client = AsyncLLMEngine.from_vllm_config( vllm_config=vllm_config, usage_context=usage_context, - disable_log_requests=engine_args.disable_log_requests, + enable_log_requests=engine_args.enable_log_requests, disable_log_stats=engine_args.disable_log_stats) yield engine_client finally: @@ -272,7 +272,7 @@ async def build_async_engine_client_from_engine_args( target=run_mp_engine, args=(vllm_config, UsageContext.OPENAI_API_SERVER, ipc_path, engine_args.disable_log_stats, - engine_args.disable_log_requests, engine_alive)) + engine_args.enable_log_requests, engine_alive)) engine_process.start() engine_pid = engine_process.pid assert engine_pid is not None, "Engine process failed to start." 
@@ -1570,10 +1570,10 @@ async def init_app_state( else: served_model_names = [args.model] - if args.disable_log_requests: - request_logger = None - else: + if args.enable_log_requests: request_logger = RequestLogger(max_log_len=args.max_log_len) + else: + request_logger = None base_model_paths = [ BaseModelPath(name=name, model_path=args.model) diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 137b368dad202..d146ad485d194 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -324,10 +324,10 @@ async def run_batch( else: served_model_names = [args.model] - if args.disable_log_requests: - request_logger = None - else: + if args.enable_log_requests: request_logger = RequestLogger(max_log_len=args.max_log_len) + else: + request_logger = None base_model_paths = [ BaseModelPath(name=name, model_path=args.model) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index d5d8d9dad73a8..7405f3986df8d 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1668,6 +1668,12 @@ class FlexibleArgumentParser(ArgumentParser): # Enable the deprecated kwarg for Python 3.12 and below def parse_known_args(self, args=None, namespace=None): + if args is not None and "--disable-log-requests" in args: + # Special case warning because the warning below won't trigger + # if --disable-log-requests is passed, because its value is the default. + logger.warning_once( + "argument '--disable-log-requests' is deprecated. 
This " + "will be removed in v0.12.0.") namespace, args = super().parse_known_args(args, namespace) for action in FlexibleArgumentParser._deprecated: if (hasattr(namespace, dest := action.dest) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index ed0d9620f4762..308ca32105ba9 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -27,7 +27,7 @@ from vllm.transformers_utils.config import ( from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import Device, cdiv +from vllm.utils import Device, cdiv, deprecate_kwargs from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError @@ -142,16 +142,22 @@ class AsyncLLM(EngineClient): pass @classmethod + @deprecate_kwargs( + "disable_log_requests", + additional_message=("This argument will have no effect. 
" + "Use `enable_log_requests` instead."), + ) def from_vllm_config( - cls, - vllm_config: VllmConfig, - start_engine_loop: bool = True, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[list[StatLoggerFactory]] = None, - disable_log_requests: bool = False, - disable_log_stats: bool = False, - client_addresses: Optional[dict[str, str]] = None, - client_index: int = 0, + cls, + vllm_config: VllmConfig, + start_engine_loop: bool = True, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[list[StatLoggerFactory]] = None, + enable_log_requests: bool = False, + disable_log_stats: bool = False, + client_addresses: Optional[dict[str, str]] = None, + client_index: int = 0, + disable_log_requests: bool = True, # Deprecated, will be removed ) -> "AsyncLLM": if not envs.VLLM_USE_V1: raise ValueError( @@ -166,7 +172,7 @@ class AsyncLLM(EngineClient): executor_class=Executor.get_class(vllm_config), start_engine_loop=start_engine_loop, stat_loggers=stat_loggers, - log_requests=not disable_log_requests, + log_requests=enable_log_requests, log_stats=not disable_log_stats, usage_context=usage_context, client_addresses=client_addresses, @@ -191,7 +197,7 @@ class AsyncLLM(EngineClient): return cls( vllm_config=vllm_config, executor_class=executor_class, - log_requests=not engine_args.disable_log_requests, + log_requests=engine_args.enable_log_requests, log_stats=not engine_args.disable_log_stats, start_engine_loop=start_engine_loop, usage_context=usage_context, From 326a1b001db10afc2dc5b2bfcb60a3b8f8bcb2ac Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 1 Aug 2025 17:32:27 +0100 Subject: [PATCH 33/54] Improve documentation of `ModelConfig.try_get_generation_config` to prevent future confusion (#21526) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 
deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 2d61552c5dadc..124d62b699771 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1575,7 +1575,18 @@ class ModelConfig: return self.multimodal_config def try_get_generation_config(self) -> dict[str, Any]: - if self.generation_config in ("auto", "vllm"): + """ + This method attempts to retrieve the non-default values of the + generation config for this model. + + The generation config can contain information about special tokens, as + well as sampling parameters, which is why this method exists separately + from `get_diff_sampling_param`. + + Returns: + A dictionary containing the non-default generation config. + """ + if self.generation_config in {"auto", "vllm"}: config = try_get_generation_config( self.hf_config_path or self.model, trust_remote_code=self.trust_remote_code, @@ -1594,13 +1605,18 @@ class ModelConfig: def get_diff_sampling_param(self) -> dict[str, Any]: """ - This method returns a dictionary containing the parameters - that differ from the default sampling parameters. If - `generation_config` is `"vllm"`, an empty dictionary is returned. + This method returns a dictionary containing the non-default sampling + parameters with `override_generation_config` applied. + + The default sampling parameters are: + + - vLLM's neutral defaults if `self.generation_config="vllm"` + - the model's defaults if `self.generation_config="auto"` + - as defined in `generation_config.json` if + `self.generation_config="path/to/generation_config/dir"` Returns: - dict[str, Any]: A dictionary with the differing sampling - parameters, if `generation_config` is `"vllm"` an empty dictionary. + A dictionary containing the non-default sampling parameters. 
""" if self.generation_config == "vllm": config = {} From 3f8e9521791dd3f41c90cc2b3c9e78a1951f5237 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 2 Aug 2025 00:33:30 +0800 Subject: [PATCH 34/54] [Bugfix] Fix glm4.1v video inference issue (#22067) Signed-off-by: Isotr0py <2037008807@qq.com> --- .../multimodal/processing/test_glm4_1v.py | 51 +++++++++++++++++++ vllm/model_executor/models/glm4_1v.py | 8 +-- 2 files changed, 53 insertions(+), 6 deletions(-) create mode 100644 tests/models/multimodal/processing/test_glm4_1v.py diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py new file mode 100644 index 0000000000000..d1c5fa8fec6d2 --- /dev/null +++ b/tests/models/multimodal/processing/test_glm4_1v.py @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest + +from vllm.assets.video import VideoAsset +from vllm.multimodal import MULTIMODAL_REGISTRY + +from ...utils import build_model_context + + +@pytest.mark.parametrize("model_id", ["THUDM/GLM-4.1V-9B-Thinking"]) +@pytest.mark.parametrize("expected_toks_per_frame", [299]) +@pytest.mark.parametrize("num_frames", [32, 128]) +@pytest.mark.parametrize("fps, expected_grid_t", [(1, 5), (2, 10)]) +def test_processor_override( + model_id: str, + expected_toks_per_frame: int, + expected_grid_t: int, + fps: int, + num_frames: int, +): + """Ensure GLM4vMultiModalProcessor can handle video frames properly.""" + ctx = build_model_context( + model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"video": 1}, + ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + tokenizer = processor.info.get_tokenizer() + hf_processor_mm_kwargs = {"fps": fps} + + # Build the image str / prompt based on the number of images we pass + video_assets = VideoAsset(name="baby_reading", num_frames=num_frames) + prompt = "<|begin_of_video|><|video|><|end_of_video|>" + + video, 
metadata = video_assets.np_ndarrays, video_assets.metadata + metadata["fps"] = fps + mm_data = {"video": [(video, metadata)]} + + processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs) + + # Ensure we have the right number of placeholders per num_crops size + hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs) + video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token) + video_tok_count = processed_inputs["prompt_token_ids"].count( + video_token_id) + grid_t, _, _ = processed_inputs["mm_kwargs"]["video_grid_thw"][0] + + assert grid_t == expected_grid_t + assert video_tok_count == expected_toks_per_frame * grid_t diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 5f306f05d140e..7c9840790fe3e 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -937,7 +937,7 @@ class Glm4vProcessingInfo(BaseProcessingInfo): total_frames: int) -> list[int]: video_processor = self.get_video_processor() - video_fps = metadata.get("fps", 2.0) + video_fps = metadata.get("fps", video_processor.fps) meta_frames = metadata.get("total_num_frames", total_frames) max_frame_idx = meta_frames - 1 duration = metadata.get("duration", @@ -1120,11 +1120,7 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): video_placeholder, ) - grid_t = len(video_outputs["video_grid_thw"]) - _, grid_h, grid_w = video_outputs["video_grid_thw"][0] - grid_thw = torch.tensor([[grid_t, grid_h, grid_w]]) - - video_grid_thw_lst.append(grid_thw) + video_grid_thw_lst.append(video_outputs["video_grid_thw"]) pixel_values_videos_lst.append( video_outputs["pixel_values_videos"]) video_outputs = dict( From b879ecd6e2636b6af893052615693a51466381ec Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Sat, 2 Aug 2025 01:09:36 +0800 Subject: [PATCH 35/54] [Bugfix] fix when skip tokenizer init (#21922) Signed-off-by: rongfu.leng --- 
tests/v1/engine/test_llm_engine.py | 26 ++++++++++++++++++++++++++ vllm/v1/engine/processor.py | 9 +++++++-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py index f37686317fd14..2848420c22085 100644 --- a/tests/v1/engine/test_llm_engine.py +++ b/tests/v1/engine/test_llm_engine.py @@ -213,3 +213,29 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): assert len(num_accepted_tokens_per_pos) == 1 assert isinstance(num_accepted_tokens_per_pos[0], Vector) assert len(num_accepted_tokens_per_pos[0].values) == 5 + + +@pytest.mark.parametrize("model", ["meta-llama/Llama-3.2-1B-Instruct"]) +def test_skip_tokenizer_initialization(model: str, + monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("VLLM_USE_V1", "1") + # This test checks if the flag skip_tokenizer_init skips the initialization + # of tokenizer and detokenizer. The generated output is expected to contain + # token ids. + llm = LLM( + model=model, + skip_tokenizer_init=True, + enforce_eager=True, + ) + sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) + + with pytest.raises(ValueError, match="cannot pass text prompts when"): + llm.generate("abc", sampling_params) + + outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, + sampling_params=sampling_params) + assert len(outputs) > 0 + completions = outputs[0].outputs + assert len(completions) > 0 + assert completions[0].text == "" + assert completions[0].token_ids diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 0f2f404a130ef..224acc47feb27 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -89,6 +89,10 @@ class Processor: return if not params.allowed_token_ids: raise ValueError("allowed_token_ids is not None and empty!") + if self.tokenizer is None: + # When skip_tokenizer_init=True, we can't validate token IDs + # Skip validation and let the model handle invalid tokens + return 
tokenizer = self.tokenizer.get_lora_tokenizer(lora_request) vocab_size = len(tokenizer) if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids): @@ -283,8 +287,9 @@ class Processor: len(decoder_inputs["prompt_token_ids"])) sampling_params.update_from_generation_config( self.generation_config_fields, eos_token_id) - sampling_params.update_from_tokenizer( - self.tokenizer.get_lora_tokenizer(lora_request)) + if self.tokenizer is not None: + sampling_params.update_from_tokenizer( + self.tokenizer.get_lora_tokenizer(lora_request)) else: pooling_params = params.clone() From d6664664b442cb236f8541a126e4076a5e12c56d Mon Sep 17 00:00:00 2001 From: Huzaifa Sidhpurwala Date: Fri, 1 Aug 2025 21:09:49 +0400 Subject: [PATCH 36/54] security policy: take 1 (#21119) Signed-off-by: Huzaifa Sidhpurwala Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Russell Bryant --- SECURITY.md | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/SECURITY.md b/SECURITY.md index 6053cfb41f35b..4f338557da1a0 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,13 +1,41 @@ # Security Policy -## Reporting a Vulnerability +## Reporting security issues: -If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. +Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). -Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). 
+## Issue triage ---- +Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). + +## Threat model Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations. Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. + +## Issue severity + +We will determine the risk of each issue, taking into account our experience dealing with past issues, versions affected, common defaults, and use cases. We use the following severity categories: + +### CRITICAL Severity +Vulnerabilities that allow remote attackers to execute arbitrary code, take full control of the system, or significantly compromise confidentiality, integrity, or availability without any interaction or privileges needed, examples include remote code execution via network, deserialization issues that allow exploit chains. Generally those issues which are rated as CVSS ≥ 9.0. + +### HIGH Severity +Serious security flaws that allow elevated impact—like RCE in specific, limited contexts or significant data loss—but require advanced conditions or some trust, examples include RCE in advanced deployment modes (e.g. multi-node), or high impact issues where some sort of privileged network access is required. These issues typically have CVSS scores between 7.0 and 8.9 + +### MODERATE Severity +Vulnerabilities that cause denial of service or partial disruption, but do not allow arbitrary code execution or data breach and have limited impact. These issues have a CVSS rating between 4.0 and 6.9 + +### LOW Severity +Minor issues such as informational disclosures, logging errors, non-exploitable flaws, or weaknesses that require local or high-privilege access and offer negligible impact. 
Examples include side channel attacks or hash collisions. These issues often have CVSS scores less than 4.0 + +## Prenotification policy + +For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we may prenotify certain organizations or vendors that ship vLLM. The purpose of this prenotification is to allow for a coordinated release of fixes for severe issues. + +* This prenotification will be in the form of a private email notification. It may also include adding security contacts to the GitHub security advisory, typically a few days before release. + +* If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis. + +* We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included. 
From ac45c44d98e77f30e47b8fb69134f4635183070d Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Fri, 1 Aug 2025 22:44:38 +0530 Subject: [PATCH 37/54] [Bugfix] [Performance] DeepEPHighThroughput + DeepSeek : Quant before Dispatch (#21837) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- .../layers/fused_moe/deepep_ht_prepare_finalize.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py index 7016ff34c3a85..f6b62254e7b4c 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -144,12 +144,13 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): "apply_router_weight_on_input is only implemented for topk=1") a1 = a1 * topk_weights.to(a1.dtype) - if quant_config.per_act_token_quant: + if quant_config.is_block_quantized: + # Quant and Dispatch a1q, a1q_scale = moe_kernel_quantize_input( a1, a1_scale, quant_dtype=quant_config.quant_dtype, - per_act_token_quant=True, + per_act_token_quant=quant_config.per_act_token_quant, block_shape=quant_config.block_shape, ) if a1q_scale is not None and a1q_scale.numel() == 1: @@ -162,8 +163,10 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): rank_topk_weights=topk_weights, num_experts=num_experts) else: - # DeepEP kernels only support dispatching per-token-quant - # quantization. dispatch in bfloat16. + # Dispatch and Quant + # DeepEP kernels only support dispatching block-quantized + # activation scales. 
+ # Dispatch in bfloat16 (expert_x, _, expert_tokens_meta, expert_topk_ids, expert_topk_weights) = self._do_dispatch( tokens=a1, @@ -171,7 +174,7 @@ class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): rank_topk_ids=topk_ids, rank_topk_weights=topk_weights, num_experts=num_experts) - # quantize now + # Quantize after dispatch. expert_x_scale = None if expert_x.numel() != 0: expert_x, expert_x_scale = moe_kernel_quantize_input( From 38c8bce8b652df87d111c04ddf849c38615000c7 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 1 Aug 2025 18:31:29 +0100 Subject: [PATCH 38/54] Enable headless models for pooling in the Transformers backend (#21767) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/models/registry.py | 1 + tests/models/test_transformers.py | 28 +++++++++++++++++----- vllm/config.py | 9 +++++-- vllm/model_executor/models/registry.py | 3 ++- vllm/model_executor/models/transformers.py | 12 ++++++++++ 5 files changed, 44 insertions(+), 9 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 806342a57dfab..fdc7888c85efb 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -525,6 +525,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { } _TRANSFORMERS_BACKEND_MODELS = { + "TransformersModel": _HfExamplesInfo("Qwen/Qwen3-Embedding-0.6B"), "TransformersForCausalLM": _HfExamplesInfo("hmellor/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501 "TransformersForMultimodalLM": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"), } diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index 5b7d90dfb896d..66ff8f7a54d31 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -34,8 +34,7 @@ def check_implementation( with runner_test(model, **kwargs_test, **kwargs) as model_test: model_config = model_test.llm.llm_engine.model_config - assert model_config.architecture == ( - 
model_config._get_transformers_backend_cls()) + assert model_config.using_transformers_backend() outputs_test = model_test.generate_greedy_logprobs(*args) @@ -135,8 +134,7 @@ def test_quantization( enforce_eager=True, **quantization_kwargs) as vllm_model: # type: ignore[arg-type] model_config = vllm_model.llm.llm_engine.model_config - assert model_config.architecture == ( - model_config._get_transformers_backend_cls()) + assert model_config.using_transformers_backend() transformers_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs) @@ -149,6 +147,25 @@ def test_quantization( ) +@pytest.mark.parametrize( + "model", + [ + # Layers live in `layers` + "Qwen/Qwen3-Embedding-0.6B", + # Layers live in `model.layers` + "meta-llama/Llama-3.2-1B-Instruct" + ], +) +def test_embed_loading(vllm_runner, model): + with vllm_runner(model, + max_model_len=1024, + enforce_eager=True, + runner="pooling", + model_impl="transformers") as model_test: + model_config = model_test.llm.llm_engine.model_config + assert model_config.using_transformers_backend() + + @pytest.mark.parametrize( "model", ["jason9693/Qwen2.5-1.5B-apeach"], @@ -169,8 +186,7 @@ def test_classify( dtype=dtype, model_impl="transformers") as vllm_model: model_config = vllm_model.llm.llm_engine.model_config - assert model_config.architecture == ( - model_config._get_transformers_backend_cls()) + assert model_config.using_transformers_backend() vllm_outputs = vllm_model.classify(example_prompts) diff --git a/vllm/config.py b/vllm/config.py index 124d62b699771..dabb4b524dfd8 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -812,12 +812,17 @@ class ModelConfig: def _get_transformers_backend_cls(self) -> str: """Determine which Transformers backend class will be used if `model_impl` is set to `transformers` or `auto`.""" + if getattr(self, "runner_type", self.runner) == "pooling": + return "TransformersModel" if self.hf_config != self.hf_text_config: # If 
'hf_text_config' is the same as 'hf_config'. If not, it is # probably a composite config, i.e. multimodal return "TransformersForMultimodalLM" - else: - return "TransformersForCausalLM" + return "TransformersForCausalLM" + + def using_transformers_backend(self) -> bool: + """Check if the model is using the Transformers backend class.""" + return self.architecture == self._get_transformers_backend_cls() @property def registry(self): diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 848c04b9b32f7..0c5d87a7dc472 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -270,8 +270,9 @@ _TRANSFORMERS_SUPPORTED_MODELS = { } _TRANSFORMERS_BACKEND_MODELS = { - "TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501 + "TransformersModel": ("transformers", "TransformersModel"), "TransformersForCausalLM": ("transformers", "TransformersForCausalLM"), + "TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501 } # yapf: enable diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index e67548800c354..5059d1e1d9fea 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -651,6 +651,18 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) +@support_torch_compile +class TransformersModel(TransformersBase): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + # Add `model.` prefix for base model checkpoints + "": "model.", + # Remove `model.` from places it should not be + "model.model.": "model.", + "model.score": "score", + }) + + @support_torch_compile class TransformersForCausalLM(TransformersBase): From 8d705996dffbb2299750b7b2b50bbcd5ccb4a5ad Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 2 Aug 2025 01:35:30 
+0800 Subject: [PATCH 39/54] [Misc] Minor enhancement of benchmark_moe (#22068) Signed-off-by: Jee Jee Li --- benchmarks/kernels/benchmark_moe.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c350aaf5d3ad2..72250e2fb6d2b 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -22,6 +22,13 @@ from vllm.utils import FlexibleArgumentParser FP8_DTYPE = current_platform.fp8_dtype() +def ensure_divisibility(numerator, denominator): + """Ensure that numerator is divisible by the denominator.""" + assert numerator % denominator == 0, ( + "intermediate_size {} is not divisible by tp {}.".format(numerator, denominator) + ) + + class BenchmarkConfig(TypedDict): BLOCK_SIZE_M: int BLOCK_SIZE_N: int @@ -603,7 +610,7 @@ def main(args: argparse.Namespace): topk = config.num_experts_per_tok intermediate_size = config.intermediate_size shard_intermediate_size = 2 * intermediate_size // args.tp_size - + ensure_divisibility(intermediate_size, args.tp_size) hidden_size = config.hidden_size dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype use_fp8_w8a8 = args.dtype == "fp8_w8a8" From 3277e8f9e19c396d6dd92a0901d2e3f8fb8982d4 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 1 Aug 2025 13:36:07 -0400 Subject: [PATCH 40/54] Fix pre-commit failure for SECURTIY.md (#22102) Signed-off-by: mgoin --- SECURITY.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/SECURITY.md b/SECURITY.md index 4f338557da1a0..414669fb3712e 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,6 +1,6 @@ # Security Policy -## Reporting security issues: +## Reporting security issues Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). 
@@ -19,15 +19,19 @@ Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/m We will determine the risk of each issue, taking into account our experience dealing with past issues, versions affected, common defaults, and use cases. We use the following severity categories: ### CRITICAL Severity + Vulnerabilities that allow remote attackers to execute arbitrary code, take full control of the system, or significantly compromise confidentiality, integrity, or availability without any interaction or privileges needed, examples include remote code execution via network, deserialization issues that allow exploit chains. Generally those issues which are rated as CVSS ≥ 9.0. ### HIGH Severity + Serious security flaws that allow elevated impact—like RCE in specific, limited contexts or significant data loss—but require advanced conditions or some trust, examples include RCE in advanced deployment modes (e.g. multi-node), or high impact issues where some sort of privileged network access is required. These issues typically have CVSS scores between 7.0 and 8.9 ### MODERATE Severity + Vulnerabilities that cause denial of service or partial disruption, but do not allow arbitrary code execution or data breach and have limited impact. These issues have a CVSS rating between 4.0 and 6.9 ### LOW Severity + Minor issues such as informational disclosures, logging errors, non-exploitable flaws, or weaknesses that require local or high-privilege access and offer negligible impact. Examples include side channel attacks or hash collisions. 
These issues often have CVSS scores less than 4.0 ## Prenotification policy From 9659bc7f271ec640da780b5ca739e261764b954b Mon Sep 17 00:00:00 2001 From: Animesh Jain Date: Fri, 1 Aug 2025 10:38:52 -0700 Subject: [PATCH 41/54] [compile][startup] Disable C++ compilation of symbolic shapes (#20836) Signed-off-by: Animesh Jain --- vllm/compilation/decorators.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 1370862d580a5..0d2c432497c40 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -267,8 +267,15 @@ def _support_torch_compile( code.co_filename) return inline_call(parent, func, args, kwargs) - with patch.object(InliningInstructionTranslator, 'inline_call', - patched_inline_call): + # Disable the C++ compilation of symbolic shape guards. C++-fication + # of symbolic shape guards can improve guard overhead. But, since + # vLLM skips guards anyway, setting this flag to False can improve + # compile time.
+ with torch._dynamo.config.patch("enable_cpp_symbolic_shape_guards", + False), patch.object( + InliningInstructionTranslator, + 'inline_call', + patched_inline_call): output = self.compiled_callable(*args, **kwargs) return output From d331759488eb7627d2454549eeb01d14f83f1c41 Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Fri, 1 Aug 2025 11:50:58 -0700 Subject: [PATCH 42/54] Introduce RayPPCommunicator for ray-based PP (#21660) Signed-off-by: Rui Qiao --- .../device_communicators/ray_communicator.py | 257 ++++++++++++++++++ vllm/envs.py | 8 + vllm/executor/ray_distributed_executor.py | 15 + 3 files changed, 280 insertions(+) create mode 100644 vllm/distributed/device_communicators/ray_communicator.py diff --git a/vllm/distributed/device_communicators/ray_communicator.py b/vllm/distributed/device_communicators/ray_communicator.py new file mode 100644 index 0000000000000..e5ba297ebcc1b --- /dev/null +++ b/vllm/distributed/device_communicators/ray_communicator.py @@ -0,0 +1,257 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import uuid +from typing import Any, Optional + +import ray +import torch +from ray.exceptions import RayChannelError +from ray.experimental.channel.communicator import (Communicator, + TorchTensorAllocator) +from torch.distributed import ReduceOp + +from vllm.distributed.device_communicators.base_device_communicator import ( + DeviceCommunicatorBase) +from vllm.distributed.parallel_state import get_pp_group +from vllm.logger import init_logger +from vllm.utils import current_stream + +logger = init_logger(__name__) + + +class RayPPCommunicator(Communicator): + """ + Communicator to be used for pipeline parallelism in Ray Compiled Graph. + This wraps around the vLLM _PP GroupCoordinator. + + This class is not thread-safe.
+ """ + + _comm: Optional[DeviceCommunicatorBase] + + def __init__( + self, + world_size: int, + comm_id: Any, + rank: Optional[int], + actor_handles: list["ray.actor.ActorHandle"], + cuda_stream: Optional[torch.cuda.Stream], + use_communication_streams: bool = False, + ): + """ + Initialize a RayPPCommunicator that can be used to communicate with + other Ray Compiled Graph actors for pipeline parallelism. + + Args: + world_size: The number of participating actors. + comm_id: A unique communicator ID. This is just to conform with + the Ray Communicator API and is not used. + rank: The rank of this actor. If None, then the caller is not a + participant of the RayPPCommunicator group (e.g., the Ray + driver). + actor_handles: A list of actor handles. + cuda_stream: A CUDA stream to dispatch communication ops to. This + is not supported. + use_communication_streams: Whether to use communication streams. + This is not supported. + """ + self._world_size = world_size + self._rank: Optional[int] = None + self._actor_handles = actor_handles + if use_communication_streams: + raise NotImplementedError( + "use_communication_streams is not supported") + if cuda_stream is not None and cuda_stream != current_stream(): + raise ValueError( + "cuda_stream other than the current stream is not supported") + + if rank is not None: + # Rank is not None, this is Ray worker + assert ray.get_gpu_ids(), "RayPPCommunicator has no GPUs assigned" + + self._comm = get_pp_group().device_communicator + + # Since we wrap around the vLLM _PP communicator, we use + # the rank from the vLLM communicator, and ignore the rank + # passed in from Ray. + # TODO(rui): refactor the Ray Communicator API so that + # it also supports no rank passed in. 
+ self._rank = self._comm.rank_in_group + + self._build_actor_rank_mapping() + else: + # Rank is None, this is Ray driver + self._comm = None + + self._closed = False + + def _build_actor_rank_mapping(self): + """ + Use collective communication to build a mapping from actor IDs to ranks. + This should be called once during initialization. + """ + if self._comm is None: + return {} + + current_actor = ray.get_runtime_context().current_actor + actor_id_str = current_actor._actor_id.hex() + + # Ray actor IDs are 32-character hex strings (128 bits) + ACTOR_ID_LEN = 32 + actor_id_bytes = actor_id_str.encode('utf-8') + assert len( + actor_id_bytes + ) == ACTOR_ID_LEN, f"Unexpected actor ID length: {len(actor_id_bytes)}" + + actor_id_tensor = torch.frombuffer( + actor_id_bytes, dtype=torch.uint8).to(self._comm.device) + + # All-gather full actor IDs from all actors + gathered_ids = self._comm.all_gather(actor_id_tensor, dim=0) + + # Build mapping: actor_id -> device_comm_rank + self._actor_id_to_rank = {} + for rank in range(self._world_size): + start_idx = rank * ACTOR_ID_LEN + end_idx = (rank + 1) * ACTOR_ID_LEN + actor_bytes = gathered_ids[start_idx:end_idx].cpu().numpy( + ).tobytes() + actor_id = actor_bytes.decode('utf-8') + self._actor_id_to_rank[actor_id] = rank + + def initialize(self, rank: int) -> None: + # No additional initialization is needed. + pass + + def get_actor_handles(self) -> list["ray.actor.ActorHandle"]: + return self._actor_handles + + def get_rank(self, actor: ray.actor.ActorHandle) -> int: + """ + Return the given actor's rank using device communicator collective ops. + """ + assert hasattr(self, '_actor_id_to_rank'), ( + "Actor rank mapping not built. 
" + "This should have been done during initialization.") + + actor_id_str = actor._actor_id.hex() + + if actor_id_str in self._actor_id_to_rank: + return self._actor_id_to_rank[actor_id_str] # type: ignore + else: + raise ValueError(f"Actor {actor} not found in communicator group") + + def get_self_rank(self) -> Optional[int]: + """ + Return this actor's rank. + """ + return self._rank + + def get_world_size(self) -> int: + """ + Return the number of ranks in the RayPPCommunicator group. + """ + return self._world_size + + def send(self, buf: "torch.Tensor", peer_rank: int) -> None: + """ + Send a torch.Tensor to a peer. + + This returns when the send kernel has been queued, but the kernel may + not have completed. Therefore, the caller should ensure that there are + no concurrent writes to the sent `buf` until the send has finished. + That is, either all writes should be submitted on the current stream + (self._cuda_stream) or, if on a different stream, that stream should + synchronize with the current stream. + + Args: + buf: The torch.Tensor to send. It should already be on this + actor's default device. + peer_rank: The rank of the actor to send to. + """ + if self._closed: + raise RayChannelError("RayPPCommunicator has been destroyed.") + + assert self._comm is not None + self._comm.send(buf, peer_rank) + + def recv( + self, + shape: tuple[int], + dtype: "torch.dtype", + peer_rank: int, + allocator: TorchTensorAllocator, + ) -> "torch.Tensor": + """ + Receive a torch.Tensor from a peer and synchronize the current stream. + + After this call returns, the receive buffer is safe to read from from + any stream. An RayChannelError will be raised if an error occurred + (e.g., remote actor died), and the buffer is not safe to read. + + Args: + shape: The shape of the tensor to receive. + dtype: The dtype of the tensor to receive. + peer_rank: The rank of the actor to receive from. + allocator: The allocator to use to create the received tensor. 
+ This is ignored for this implementation. + """ + if self._closed: + raise RayChannelError("RayPPCommunicator has been destroyed.") + + assert self._comm is not None + size = torch.Size(shape) + buf = self._comm.recv(size, dtype, src=peer_rank) + + # Buffer values are undefined if NCCL ops are aborted. Therefore, we + # need to synchronize here and check that the channel is still + # open to ensure that the receive buffer is valid. + # TODO(swang): Avoid CUDA synchronization. + current_stream().synchronize() + + if self._closed: + raise RayChannelError("RayPPCommunicator has been destroyed.") + return buf + + def allgather( + self, + send_buf: "torch.Tensor", + recv_buf: "torch.Tensor", + ): + raise NotImplementedError("allgather is not supported") + + def allreduce( + self, + send_buf: "torch.Tensor", + recv_buf: "torch.Tensor", + op: ReduceOp = ReduceOp.SUM, + ): + raise NotImplementedError("allreduce is not supported") + + def reducescatter( + self, + send_buf: "torch.Tensor", + recv_buf: "torch.Tensor", + op: ReduceOp = ReduceOp.SUM, + ): + raise NotImplementedError("reducescatter is not supported") + + @property + def recv_stream(self): + return torch.cuda.StreamContext(current_stream()) + + @property + def send_stream(self): + return torch.cuda.StreamContext(current_stream()) + + def destroy(self) -> None: + # Just sets a flag, vLLM manages the lifecycle of the underlying + # _PP GroupCoordinator. 
+ self._closed = True + + def get_transport_name(self) -> str: + return "nccl" + + @classmethod + def generate_communicator_id(cls) -> Any: + return uuid.uuid4() diff --git a/vllm/envs.py b/vllm/envs.py index 7553eccf16ea9..2fda2903179b5 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -55,6 +55,7 @@ if TYPE_CHECKING: VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto" VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False + VLLM_USE_RAY_WRAPPED_PP_COMM: bool = True VLLM_XLA_USE_SPMD: bool = False VLLM_WORKER_MULTIPROC_METHOD: str = "fork" VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets") @@ -498,6 +499,13 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: bool(int(os.getenv("VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM", "0")) ), + # If the env var is set, it uses a Ray Communicator wrapping + # vLLM's pipeline parallelism communicator to interact with Ray's + # Compiled Graph. Otherwise, it uses Ray's NCCL communicator. + # This flag is ignored if VLLM_USE_RAY_COMPILED_DAG is not set. + "VLLM_USE_RAY_WRAPPED_PP_COMM": + lambda: bool(int(os.getenv("VLLM_USE_RAY_WRAPPED_PP_COMM", "1"))), + # Use dedicated multiprocess context for workers. 
# Both spawn and fork work "VLLM_WORKER_MULTIPROC_METHOD": diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index e9ad62aeb99a8..37c3fe59c65dd 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -608,6 +608,21 @@ class RayDistributedExecutor(DistributedExecutorBase): forward_dag = MultiOutputNode(outputs) + if envs.VLLM_USE_RAY_WRAPPED_PP_COMM: + from ray.experimental.channel.accelerator_context import ( + register_accelerator_context) + + from vllm.distributed.device_communicators.ray_communicator import ( + RayPPCommunicator) + register_accelerator_context(torch_module_name="cuda", + communicator_cls=RayPPCommunicator) + logger.info("Using RayPPCommunicator " + "(which wraps vLLM _PP GroupCoordinator) " + "for Ray Compiled Graph communication.") + else: + logger.info("Using Ray's NCCL communicator for " + "Ray Compiled Graph communication.") + return forward_dag.experimental_compile( enable_asyncio=enable_asyncio, _overlap_gpu_communication=envs. From d84b97a3e33ed79aaba7552bfe5889d363875562 Mon Sep 17 00:00:00 2001 From: XiongfeiWei Date: Fri, 1 Aug 2025 11:56:08 -0700 Subject: [PATCH 43/54] Add lora test for tp>1 case for TPU. 
(#21970) Signed-off-by: Xiongfei Wei --- tests/tpu/lora/test_lora.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/tests/tpu/lora/test_lora.py b/tests/tpu/lora/test_lora.py index b26bdd34d890e..4c47b8c43caff 100644 --- a/tests/tpu/lora/test_lora.py +++ b/tests/tpu/lora/test_lora.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest +from torch_xla._internal import tpu import vllm from vllm.lora.request import LoRARequest @@ -27,25 +28,31 @@ def use_v1_only(monkeypatch: pytest.MonkeyPatch): yield -def setup_vllm(num_loras: int) -> vllm.LLM: +def setup_vllm(num_loras: int, tp: int) -> vllm.LLM: return vllm.LLM(model="Qwen/Qwen2.5-3B-Instruct", num_scheduler_steps=1, max_model_len=256, max_seq_len_to_capture=256, max_num_seqs=8, + tensor_parallel_size=tp, enable_lora=True, max_loras=num_loras, max_lora_rank=8) -def test_single_lora(): +TPU_TENSOR_PARALLEL_SIZES = [1, tpu.num_available_chips() + ] if tpu.num_available_chips() > 1 else [1] + + +@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES) +def test_single_lora(tp: int): """ This test ensures we can run a single LoRA adapter on the TPU backend. We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter" which will force Qwen2.5-3B-Instruct to claim 1+1=1. """ - llm = setup_vllm(1) + llm = setup_vllm(1, tp) prompt = "What is 1+1? \n" @@ -63,7 +70,8 @@ def test_single_lora(): assert int(answer) == 1 -def test_lora_hotswapping(): +@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES) +def test_lora_hotswapping(tp: int): """ This test ensures we can run multiple LoRA adapters on the TPU backend, even if we only have space to store 1. @@ -79,7 +87,7 @@ def test_lora_hotswapping(): for i in range(1, 5) ] - llm = setup_vllm(1) + llm = setup_vllm(1, tp) prompt = "What is 1+1? 
\n" @@ -94,7 +102,8 @@ def test_lora_hotswapping(): assert int(answer) == i + 1 -def test_multi_lora(): +@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES) +def test_multi_lora(tp: int): """ This test ensures we can run multiple LoRA adapters on the TPU backend, when we have enough space to store all of them. @@ -109,7 +118,7 @@ def test_multi_lora(): for i in range(1, 5) ] - llm = setup_vllm(4) + llm = setup_vllm(4, tp) prompt = "What is 1+1? \n" From 881e1af43a1bb7b4bedd373e413eb7ad9dc9f920 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 1 Aug 2025 22:40:45 +0100 Subject: [PATCH 44/54] [BugFix] Harden distributed DP startup (#21538) Signed-off-by: Nick Hill --- vllm/utils/__init__.py | 3 ++ vllm/v1/engine/coordinator.py | 12 +++++++ vllm/v1/engine/core.py | 61 +++++++++++++++++++++++------------ 3 files changed, 56 insertions(+), 20 deletions(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 7405f3986df8d..0d3fa6b059beb 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -2794,6 +2794,9 @@ def make_zmq_socket( if linger is not None: socket.setsockopt(zmq.LINGER, linger) + if socket_type == zmq.XPUB: + socket.setsockopt(zmq.XPUB_VERBOSE, True) + # Determine if the path is a TCP socket with an IPv6 address. # Enable IPv6 on the zmq socket if so. scheme, host, _ = split_zmq_path(path) diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py index 440628576bcb7..8d8d1689e61e3 100644 --- a/vllm/v1/engine/coordinator.py +++ b/vllm/v1/engine/coordinator.py @@ -172,6 +172,18 @@ class DPCoordinatorProc: bind=True, ) as publish_back: + # Wait until all engines subscribe. + for _ in self.engines: + if publish_back.recv() != b'\x01': + logger.error( + "DP Coordinator received unexpected message while " + "waiting for engines to subscribe") + return + # Send ready message to engines. 
+ publish_back.send(b"READY") + + logger.info("All engine subscriptions received by DP coordinator") + poller = zmq.Poller() poller.register(publish_front, zmq.POLLIN) poller.register(output_back, zmq.POLLIN) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 6ae5736df98b8..0a889b2a0a184 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -461,8 +461,11 @@ class EngineCoreProc(EngineCore): self.has_coordinator = addresses.coordinator_output is not None self.frontend_stats_publish_address = ( addresses.frontend_stats_publish_address) + logger.debug("Has DP Coordinator: %s, stats publish address: %s", + self.has_coordinator, + self.frontend_stats_publish_address) # Only publish request queue stats to coordinator for "internal" - # LB mode. + # and "hybrid" LB modes . self.publish_dp_lb_stats = ( self.has_coordinator and not vllm_config.parallel_config.data_parallel_external_lb) @@ -472,25 +475,38 @@ class EngineCoreProc(EngineCore): super().__init__(vllm_config, executor_class, log_stats, executor_fail_callback) + # Background Threads and Queues for IO. These enable us to + # overlap ZMQ socket IO with GPU since they release the GIL, + # and to overlap some serialization/deserialization with the + # model forward pass. + # Threads handle Socket <-> Queues and core_busy_loop uses Queue. + ready_event = threading.Event() + input_thread = threading.Thread(target=self.process_input_sockets, + args=(addresses.inputs, + addresses.coordinator_input, + identity, ready_event), + daemon=True) + input_thread.start() + + self.output_thread = threading.Thread( + target=self.process_output_sockets, + args=(addresses.outputs, addresses.coordinator_output, + self.engine_index), + daemon=True) + self.output_thread.start() + + # Don't complete handshake until DP coordinator ready message is + # received. 
+ while not ready_event.wait(timeout=10): + if not input_thread.is_alive(): + raise RuntimeError( + "Input socket thread died during startup") + assert addresses.coordinator_input is not None + logger.info("Waiting for READY message from DP Coordinator...") + self.step_fn = (self.step if self.batch_queue is None else self.step_with_batch_queue) - # Background Threads and Queues for IO. These enable us to - # overlap ZMQ socket IO with GPU since they release the GIL, - # and to overlap some serialization/deserialization with the - # model forward pass. - # Threads handle Socket <-> Queues and core_busy_loop uses Queue. - threading.Thread(target=self.process_input_sockets, - args=(addresses.inputs, addresses.coordinator_input, - identity), - daemon=True).start() - self.output_thread = threading.Thread( - target=self.process_output_sockets, - args=(addresses.outputs, addresses.coordinator_output, - self.engine_index), - daemon=True) - self.output_thread.start() - @contextmanager def _perform_handshakes( self, @@ -505,10 +521,10 @@ class EngineCoreProc(EngineCore): For DP=1 or offline mode, this is with the colocated front-end process. - For DP>1 with internal loadbalancing this is with the shared front-end + For DP>1 with internal load-balancing this is with the shared front-end process which may reside on a different node. - For DP>1 with external or hybrid loadbalancing, two handshakes are + For DP>1 with external or hybrid load-balancing, two handshakes are performed: - With the rank 0 front-end process which retrieves the DP Coordinator ZMQ addresses and DP process group address. @@ -772,7 +788,7 @@ class EngineCoreProc(EngineCore): def process_input_sockets(self, input_addresses: list[str], coord_input_address: Optional[str], - identity: bytes): + identity: bytes, ready_event: threading.Event): """Input socket IO thread.""" # Msgpack serialization decoding. @@ -809,9 +825,14 @@ class EngineCoreProc(EngineCore): # back to us. 
input_socket.send(b'') poller.register(input_socket, zmq.POLLIN) + if coord_socket is not None: + # Wait for ready message from coordinator. + assert coord_socket.recv() == b"READY" poller.register(coord_socket, zmq.POLLIN) + ready_event.set() + del ready_event while True: for input_socket, _ in poller.poll(): # (RequestType, RequestData) From 88faa466d788e25082c02dc9688931d7976361f9 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 1 Aug 2025 19:18:38 -0400 Subject: [PATCH 45/54] [CI] Initial tests for SM100 Blackwell runner (#21877) Signed-off-by: mgoin --- .buildkite/test-pipeline.yaml | 24 ++++++++++++++++--- tests/compile/test_fusion_all_reduce.py | 15 +++++++----- .../quantization/test_cutlass_scaled_mm.py | 5 ---- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 598fd5762985e..cc1223d4c4653 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -647,13 +647,31 @@ steps: - label: Blackwell Test working_dir: "/vllm-workspace/" gpu: b200 - optional: true + # optional: true source_file_dependencies: - - csrc/ - - vllm/ + - csrc/quantization/fp4/ + - csrc/attention/mla/ + - csrc/quantization/cutlass_w8a8/moe/ + - vllm/model_executor/layers/fused_moe/cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/fusion.py commands: - nvidia-smi - python3 examples/offline_inference/basic/chat.py + # Attention + # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 + - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_decode_attention.py + - pytest -v -s tests/kernels/test_cutlass_mla_decode.py + # Quantization + - pytest -v -s 
tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py + # Fusion + - pytest -v -s tests/compile/test_fusion_all_reduce.py ##### 1 GPU test ##### ##### multi gpus test ##### diff --git a/tests/compile/test_fusion_all_reduce.py b/tests/compile/test_fusion_all_reduce.py index b394e0035c689..4c3cf6c2a10cf 100644 --- a/tests/compile/test_fusion_all_reduce.py +++ b/tests/compile/test_fusion_all_reduce.py @@ -136,12 +136,15 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module): @multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("test_model", [ - TestAllReduceRMSNormModel, - TestAllReduceFusedAddRMSNormModel, - TestAllReduceFusedAddRMSNormStaticQuantFP8Model, - TestAllReduceFusedAddRMSNormStaticQuantFP4Model, -]) +@pytest.mark.parametrize( + "test_model", + [ + TestAllReduceRMSNormModel, + TestAllReduceFusedAddRMSNormModel, + TestAllReduceFusedAddRMSNormStaticQuantFP8Model, + # TODO: Enable with torch==2.8.0 + # TestAllReduceFusedAddRMSNormStaticQuantFP4Model, + ]) @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seq_len", [8]) @pytest.mark.parametrize("hidden_size", [16]) diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py index 544e6dc197904..8730eeaaa761c 100644 --- a/tests/kernels/quantization/test_cutlass_scaled_mm.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -559,8 +559,6 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool, m_a_scales = m_g if per_act_token else 1 n_b_scales = n_g if per_out_ch else 1 - print("shape:", m_g, n_g, k_g) - # Create group-specific A and B (FP8) and output (FP16/FP32) a_g = to_fp8(torch.randn((m_g, k_g), device=device)) b_g = to_fp8(torch.randn((n_g, k_g), device=device).t()) @@ -639,7 +637,4 @@ def 
test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool, for g in range(num_experts): baseline = baseline_tensors[g] c = out_tensors_stacked[expert_offsets[g]:expert_offsets[g + 1]] - print(baseline) - print(c) - print("*") torch.testing.assert_close(c, baseline, rtol=1e-2, atol=5e-4) From eefbf4a68b7b0a5b8364a59647906be1b7f043e2 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 1 Aug 2025 19:18:51 -0400 Subject: [PATCH 46/54] [Perf] Optimize `reshape_and_cache_flash` CUDA Kernel (#22036) Signed-off-by: yewentao256 --- .../benchmark_reshape_and_cache_flash.py | 156 ++++++++++++++++++ csrc/cache_kernels.cu | 92 ++++++++--- 2 files changed, 225 insertions(+), 23 deletions(-) create mode 100644 benchmarks/kernels/benchmark_reshape_and_cache_flash.py diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py new file mode 100644 index 0000000000000..d4648c18f31d5 --- /dev/null +++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py @@ -0,0 +1,156 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from __future__ import annotations + +import random +import time + +import torch +from tabulate import tabulate + +from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils import ( + STR_DTYPE_TO_TORCH_DTYPE, + FlexibleArgumentParser, + create_kv_caches_with_random_flash, +) + +logger = init_logger(__name__) + + +@torch.inference_mode() +def run_benchmark( + num_tokens: int, + num_heads: int, + head_size: int, + block_size: int, + num_blocks: int, + dtype: torch.dtype, + kv_cache_dtype: str, + kv_cache_layout: str, + num_iters: int, + device: str = "cuda", +) -> float: + """Return latency (seconds) for given num_tokens.""" + + if kv_cache_dtype == "fp8" and head_size % 16: + raise ValueError("fp8 kv-cache 
requires head_size to be a multiple of 16.") + + current_platform.seed_everything(42) + torch.set_default_device(device) + + # create random key / value tensors [T, H, D]. + key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device) + value = torch.randn_like(key) + + # prepare the slot mapping. + # each token is assigned a unique slot in the KV-cache. + num_slots = block_size * num_blocks + if num_tokens > num_slots: + raise ValueError("num_tokens cannot exceed the total number of cache slots") + slot_mapping_lst = random.sample(range(num_slots), num_tokens) + slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device) + + key_caches, value_caches = create_kv_caches_with_random_flash( + num_blocks, + block_size, + 1, # num_layers + num_heads, + head_size, + kv_cache_dtype, + dtype, + device=device, + cache_layout=kv_cache_layout, + ) + key_cache, value_cache = key_caches[0], value_caches[0] + + # compute per-kernel scaling factors for fp8 conversion (if used). 
+ k_scale = (key.amax() / 64.0).to(torch.float32) + v_scale = (value.amax() / 64.0).to(torch.float32) + + def run_cuda_benchmark(n_iters: int) -> float: + nonlocal key, value, key_cache, value_cache, slot_mapping + torch.cuda.synchronize() + start = time.perf_counter() + for _ in range(n_iters): + ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + slot_mapping, + kv_cache_dtype, + k_scale, + v_scale, + ) + torch.cuda.synchronize() + end = time.perf_counter() + return (end - start) / n_iters + + # warm-up + run_cuda_benchmark(3) + + lat = run_cuda_benchmark(num_iters) + + # free tensors to mitigate OOM when sweeping + del key, value, key_cache, value_cache, slot_mapping + torch.cuda.empty_cache() + + return lat + + +def main(args): + rows = [] + for layout in ["NHD", "HND"]: + for exp in range(1, 17): + n_tok = 2**exp + lat = run_benchmark( + num_tokens=n_tok, + num_heads=args.num_heads, + head_size=args.head_size, + block_size=args.block_size, + num_blocks=args.num_blocks, + dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], + kv_cache_dtype=args.kv_cache_dtype, + kv_cache_layout=layout, + num_iters=args.iters, + device="cuda", + ) + rows.append([n_tok, layout, f"{lat * 1e6:.3f}"]) + + print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"])) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser() + + parser.add_argument("--num-heads", type=int, default=128) + parser.add_argument( + "--head-size", + type=int, + choices=[64, 80, 96, 112, 120, 128, 192, 256], + default=128, + ) + parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) + parser.add_argument("--num-blocks", type=int, default=128 * 512) + + parser.add_argument( + "--dtype", + type=str, + choices=["half", "bfloat16", "float"], + default="bfloat16", + ) + + parser.add_argument( + "--kv-cache-dtype", + type=str, + choices=["auto", "fp8"], + default="auto", + ) + + parser.add_argument("--iters", type=int, default=100) + args = 
parser.parse_args() + + main(args) diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 88559c8fe7183..131dcb15cd7e9 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -5,6 +5,7 @@ #include "cuda_utils.h" #include "cuda_compat.h" #include "dispatch_utils.h" +#include "quantization/vectorization_utils.cuh" #ifdef USE_ROCM #include "quantization/fp8/amd/quant_utils.cuh" @@ -261,14 +262,26 @@ __global__ void reshape_and_cache_kernel( } } +// Used by vectorization_utils to copy/convert one element +template +struct CopyWithScaleOp { + float scale; + + __device__ __forceinline__ void operator()(OutT& dst, const InT src) const { + if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { + dst = static_cast(src); + } else { + dst = fp8::scaled_convert(src, scale); + } + } +}; + template __global__ void reshape_and_cache_flash_kernel( const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] - cache_t* __restrict__ key_cache, // [num_blocks, block_size, num_heads, - // head_size] - cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads, - // head_size] + cache_t* __restrict__ key_cache, // NHD or HND, shape see comments below + cache_t* __restrict__ value_cache, // same above const int64_t* __restrict__ slot_mapping, // [num_tokens] const int64_t block_stride, const int64_t page_stride, const int64_t head_stride, const int64_t key_stride, @@ -282,25 +295,58 @@ __global__ void reshape_and_cache_flash_kernel( } const int64_t block_idx = slot_idx / block_size; const int64_t block_offset = slot_idx % block_size; - const int n = num_heads * head_size; - for (int i = threadIdx.x; i < n; i += blockDim.x) { - const int64_t src_key_idx = token_idx * key_stride + i; - const int64_t src_value_idx = token_idx * value_stride + i; - const int head_idx = i / head_size; - const int head_offset = i % head_size; - const int64_t tgt_key_value_idx = block_idx * 
block_stride + - block_offset * page_stride + - head_idx * head_stride + head_offset; - scalar_t tgt_key = key[src_key_idx]; - scalar_t tgt_value = value[src_value_idx]; - if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { - key_cache[tgt_key_value_idx] = tgt_key; - value_cache[tgt_key_value_idx] = tgt_value; - } else { - key_cache[tgt_key_value_idx] = - fp8::scaled_convert(tgt_key, *k_scale); - value_cache[tgt_key_value_idx] = - fp8::scaled_convert(tgt_value, *v_scale); + const int n_elems = num_heads * head_size; + + // pointers to the beginning of the source row for this token. + const scalar_t* __restrict__ key_src = key + token_idx * key_stride; + const scalar_t* __restrict__ value_src = value + token_idx * value_stride; + + // find the start position inside the kv-cache for this token. + cache_t* __restrict__ key_dst = + key_cache + block_idx * block_stride + block_offset * page_stride; + cache_t* __restrict__ value_dst = + value_cache + block_idx * block_stride + block_offset * page_stride; + + // this is true for the NHD layout where `head_stride == head_size` + const bool is_contiguous_heads = (head_stride == head_size); + + float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale; + float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale; + constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 
8 : 4; + CopyWithScaleOp k_op{k_scale_val}; + CopyWithScaleOp v_op{v_scale_val}; + if (is_contiguous_heads) { + // NHD layout + // kv cache: [num_blocks, block_size, num_heads, head_size] + vectorize_with_alignment(key_src, key_dst, n_elems, threadIdx.x, + blockDim.x, k_op); + + vectorize_with_alignment(value_src, value_dst, n_elems, + threadIdx.x, blockDim.x, v_op); + + } else { + // HND layout: heads are strided, but each head_size segment is contiguous + // kv cache: [num_blocks, num_heads, block_size, head_size] + const int lane = threadIdx.x & 31; // 0..31 within warp + const int warp_id = threadIdx.x >> 5; // warp index within block + const int warps_per_block = blockDim.x >> 5; + + for (int head = warp_id; head < num_heads; head += warps_per_block) { + const scalar_t* __restrict__ k_src_h = key_src + head * head_size; + const scalar_t* __restrict__ v_src_h = value_src + head * head_size; + + cache_t* __restrict__ k_dst_h = + key_dst + static_cast(head) * head_stride; + cache_t* __restrict__ v_dst_h = + value_dst + static_cast(head) * head_stride; + + // within each head, let the 32 threads of the warp perform the vector + // copy + vectorize_with_alignment(k_src_h, k_dst_h, head_size, lane, 32, + k_op); + + vectorize_with_alignment(v_src_h, v_dst_h, head_size, lane, 32, + v_op); } } } From 3654847db5a9b9a0955f8416292d94fa1c827f77 Mon Sep 17 00:00:00 2001 From: JartX Date: Sat, 2 Aug 2025 03:12:19 +0200 Subject: [PATCH 47/54] feat: Add Support GPTQ Quantization MOE on ROCM vllm serve (#21733) --- .../layers/fused_moe/fused_moe.py | 4 ++-- .../layers/quantization/gptq.py | 22 ++++++++++++++++--- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index b69575c7e96de..56d1dfe135b3b 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -761,8 +761,8 @@ def 
get_moe_wna16_block_config(config: dict[str, def should_moe_wna16_use_cuda(num_valid_tokens: int, group_size: int, num_experts: int, bit: int): - return bit == 4 and group_size in [32, 64, 128] and \ - num_valid_tokens / num_experts <= 6 + return current_platform.is_cuda() and bit == 4 and \ + group_size in [32, 64, 128] and num_valid_tokens / num_experts <= 6 def get_default_config( diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index d3ab1be3bee01..f18c936bac605 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -10,10 +10,11 @@ import torch from torch.nn.parameter import Parameter from vllm import _custom_ops as ops +from vllm.model_executor.layers.fused_moe.layer import FusedMoE from vllm.model_executor.layers.linear import LinearMethodBase from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) + QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.utils.gptq_utils import ( get_linear_quant_method) from vllm.model_executor.parameter import (ChannelQuantScaleParameter, @@ -110,8 +111,23 @@ class GPTQConfig(QuantizationConfig): return cls(weight_bits, group_size, desc_act, lm_head_quantized, dynamic) - def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["GPTQLinearMethod"]: + def get_quant_method( + self, layer: torch.nn.Module, prefix: str + ) -> Optional[Union["GPTQLinearMethod", "QuantizeMethodBase"]]: + if isinstance(layer, FusedMoE): + # GPTQ MoE support: fall back to MoeWNA16 for broad compatibility + from .moe_wna16 import MoeWNA16Config + + config = { + "quant_method": "gptq", + "bits": self.weight_bits, + "group_size": self.group_size, + "sym": True, # GPTQ typically uses symmetric quantization + "lm_head": False, + } + return 
MoeWNA16Config.from_config(config).get_quant_method( + layer, prefix) + return get_linear_quant_method(self, layer, prefix, GPTQLinearMethod) From 23322431c802bb1057426c7ca31b22e859b51644 Mon Sep 17 00:00:00 2001 From: fhl2000 <63384265+fhl2000@users.noreply.github.com> Date: Sat, 2 Aug 2025 09:49:34 +0800 Subject: [PATCH 48/54] [V1][CUDA] Full cudagraph support for FlashInfer (#21367) --- vllm/v1/attention/backends/flash_attn.py | 7 +- vllm/v1/attention/backends/flashinfer.py | 355 ++++++++++++++++-- vllm/v1/attention/backends/mla/flashmla.py | 4 +- .../attention/backends/mla/rocm_aiter_mla.py | 4 +- vllm/v1/attention/backends/triton_attn.py | 6 +- vllm/v1/attention/backends/utils.py | 18 +- vllm/v1/worker/gpu_model_runner.py | 24 +- vllm/v1/worker/gpu_worker.py | 5 + 8 files changed, 376 insertions(+), 47 deletions(-) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 3f9afa67aef70..f086bab2556eb 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -25,7 +25,8 @@ if is_flash_attn_varlen_func_available(): from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger from vllm.utils import cdiv -from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, +from vllm.v1.attention.backends.utils import (AttentionCGSupport, + AttentionMetadataBuilder, CommonAttentionMetadata, get_kv_cache_layout) from vllm.v1.kv_cache_interface import AttentionSpec @@ -153,7 +154,9 @@ def _get_sliding_window_configs( class FlashAttentionMetadataBuilder( AttentionMetadataBuilder[FlashAttentionMetadata]): - full_cudagraph_supported: ClassVar[bool] = get_flash_attn_version() == 3 + attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.NEVER if get_flash_attn_version() == 2 \ + else AttentionCGSupport.ALWAYS def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: 
torch.device): diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index f8af1d7e41831..0aaad02b5b840 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -4,26 +4,28 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, ClassVar, Optional, Union import torch from flashinfer import (BatchDecodeWithPagedKVCacheWrapper, BatchPrefillWithPagedKVCacheWrapper, MultiLevelCascadeAttentionWrapper) -from flashinfer.decode import trtllm_batch_decode_with_kv_cache +from flashinfer.decode import (_get_range_buf, get_seq_lens, + trtllm_batch_decode_with_kv_cache) import vllm.envs as envs from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionType) from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import cdiv +from vllm.utils import cdiv, is_pin_memory_available from vllm.utils.flashinfer import use_trtllm_decode_attention from vllm.v1.attention.backends.flash_attn import use_cascade_attention from vllm.v1.attention.backends.utils import ( - AttentionMetadataBuilder, CommonAttentionMetadata, get_kv_cache_layout, - get_per_layer_parameters, infer_global_hyperparameters, - reorder_batch_to_split_decodes_and_prefills, split_decodes_and_prefills) + AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, + get_kv_cache_layout, get_per_layer_parameters, + infer_global_hyperparameters, reorder_batch_to_split_decodes_and_prefills, + split_decodes_and_prefills) from vllm.v1.kv_cache_interface import AttentionSpec if TYPE_CHECKING: @@ -174,26 +176,66 @@ class FlashInferMetadata: class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): + attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.PURE_DECODE_ONLY def __init__(self, kv_cache_spec: AttentionSpec, layer_names: 
list[str], vllm_config: VllmConfig, device: torch.device): self.device = device + self.vllm_config = vllm_config + self.cache_config = vllm_config.cache_config + self.kv_cache_spec = kv_cache_spec self._workspace_buffer = None self._prefill_wrapper = None # Wrapper for prefill/append - self._decode_wrapper = None # Wrapper for decode + self._decode_wrapper = None # Wrapper for decode (general shape) + + self.compilation_config = vllm_config.compilation_config + max_num_pages_per_req = cdiv(vllm_config.model_config.max_model_len, + self.kv_cache_spec.block_size) + max_num_reqs = vllm_config.scheduler_config.max_num_seqs + max_num_pages = max_num_reqs * max_num_pages_per_req + self.enable_cuda_graph = self.compilation_config.full_cuda_graph + if self.enable_cuda_graph: + # For full cudagraph capture, one `decode_wrapper` for each batch + # size is needed for FlashInfer. + self._decode_wrappers_cudagraph: dict[ + int, BatchDecodeWithPagedKVCacheWrapper] = {} + self._decode_cudagraph_max_bs = min( + max_num_reqs, self.compilation_config.max_capture_size) + self._cascade_wrapper = None # Wrapper for cascade attention # Global hyperparameters shared by all attention layers self.global_hyperparameters = infer_global_hyperparameters( get_per_layer_parameters(vllm_config, layer_names, FlashInferImpl)) - self.vllm_config = vllm_config - self.cache_config = vllm_config.cache_config - self.kv_cache_spec = kv_cache_spec - max_num_blocks_per_request = cdiv( - vllm_config.model_config.max_model_len, - self.kv_cache_spec.block_size) - self.block_table_arange = torch.arange(max_num_blocks_per_request, + # Preparing persistent buffers (device-side) + self.paged_kv_indptr = torch.zeros(max_num_reqs + 1, + dtype=torch.int32, + device=self.device) + self.paged_kv_indices = torch.zeros( + max_num_pages, # max num pages possible + dtype=torch.int32, + device=self.device) + self.paged_kv_last_page_len = torch.zeros(max_num_reqs, + dtype=torch.int32, + device=self.device) + # host-side 
buffer + pin_memory = is_pin_memory_available() + self.paged_kv_indptr_cpu = torch.zeros(max_num_reqs + 1, + dtype=torch.int32, + device="cpu", + pin_memory=pin_memory) + self.paged_kv_indices_cpu = torch.zeros(max_num_pages, + dtype=torch.int32, + device="cpu", + pin_memory=pin_memory) + self.paged_kv_last_page_len_cpu = torch.zeros(max_num_reqs, + dtype=torch.int32, + device="cpu", + pin_memory=pin_memory) + + self.block_table_arange = torch.arange(max_num_pages_per_req, dtype=torch.int32, device=self.device) @@ -217,8 +259,16 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self._get_workspace_buffer(), get_kv_cache_layout()) return self._prefill_wrapper - def _get_decode_wrapper(self): - if self._decode_wrapper is None: + def _get_decode_wrapper(self, + batch_size: int, + use_cudagraph: bool = False): + if use_cudagraph: + decode_wrapper = self._decode_wrappers_cudagraph.get( + batch_size, None) + else: + decode_wrapper = self._decode_wrapper + + if decode_wrapper is None: num_qo_heads = ( self.vllm_config.model_config.get_num_attention_heads( self.vllm_config.parallel_config)) @@ -226,11 +276,32 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.vllm_config.parallel_config) use_tensor_cores = envs.VLLM_FLASHINFER_FORCE_TENSOR_CORES or ( num_qo_heads // num_kv_heads > 4) - self._decode_wrapper = BatchDecodeWithPagedKVCacheWrapper( + + if use_cudagraph: + paged_kv_indptr = self.paged_kv_indptr[:batch_size + 1] + paged_kv_indices = self.paged_kv_indices + paged_kv_last_page_len = self.paged_kv_last_page_len[: + batch_size] + else: + paged_kv_indptr = None + paged_kv_indices = None + paged_kv_last_page_len = None + decode_wrapper = BatchDecodeWithPagedKVCacheWrapper( self._get_workspace_buffer(), get_kv_cache_layout(), + use_cuda_graph=use_cudagraph, + paged_kv_indptr_buffer=paged_kv_indptr, + paged_kv_indices_buffer=paged_kv_indices, + paged_kv_last_page_len_buffer=paged_kv_last_page_len, 
use_tensor_cores=use_tensor_cores) - return self._decode_wrapper + + # save the decode wrapper + if use_cudagraph: + self._decode_wrappers_cudagraph[batch_size] = decode_wrapper + else: + self._decode_wrapper = decode_wrapper + + return decode_wrapper def _get_cascade_wrapper(self): if self._cascade_wrapper is None: @@ -308,16 +379,44 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): ) if num_decodes > 0: - attn_metadata.decode_wrapper = self._get_decode_wrapper() + pure_decode = num_prefills == 0 + # possible required padding for cudagraph replay + use_cudagraph = (self.enable_cuda_graph and pure_decode and + num_decodes <= self._decode_cudagraph_max_bs) + if use_cudagraph: + num_input_tokens = ( + self.vllm_config.pad_for_cudagraph(num_decodes)) + # Carefully fill the padding region with reasonable values + # on cpu. + # Make sure paged_kv_indptr_cpu is not decreasing + self.paged_kv_indptr_cpu[1 + num_decodes:1 + + num_input_tokens].fill_( + attn_metadata. + paged_kv_indptr_cpu[-1]) + # Fill the remaining paged_kv_last_page_len_cpu with 1. + # This is because flashinfer treats 0 as a full page + # instead of empty. + self.paged_kv_last_page_len_cpu[ + num_decodes:num_input_tokens].fill_(1) + + else: + num_input_tokens = num_decodes + + attn_metadata.decode_wrapper = self._get_decode_wrapper( + num_input_tokens, use_cudagraph) if not use_trtllm_decode_attention( num_decodes, attn_metadata.max_seq_len, self.cache_config.cache_dtype, attn_metadata.num_qo_heads, attn_metadata.num_kv_heads, attn_metadata.head_dim): - attn_metadata.decode_wrapper.plan( - attn_metadata.paged_kv_indptr_cpu[:num_decodes + 1], + # Use the persistent buffer with padding length, + # instead of the same address but chunked version + # in attn_metadata when using cudagraph.
+ fast_plan_decode( + attn_metadata.decode_wrapper, + self.paged_kv_indptr_cpu[:num_input_tokens + 1], attn_metadata.paged_kv_indices, - attn_metadata.paged_kv_last_page_len_cpu[:num_decodes], + self.paged_kv_last_page_len_cpu[:num_input_tokens], attn_metadata.num_qo_heads, attn_metadata.num_kv_heads, attn_metadata.head_dim, @@ -336,6 +435,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata, fast_build: bool = False) -> FlashInferMetadata: + num_reqs = common_attn_metadata.num_reqs num_actual_tokens = common_attn_metadata.num_actual_tokens num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens =\ split_decodes_and_prefills(common_attn_metadata) @@ -381,18 +481,26 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): non_blocking=True) mask = (self.block_table_arange[:max_num_blocks].unsqueeze(0) < block_table_bounds.unsqueeze(1)) - paged_kv_indices = block_table_tensor[:, :max_num_blocks][mask] + # write self.paged_kv_indices inplace + num_actual_pages = torch.sum(mask) + paged_kv_indices = self.paged_kv_indices[:num_actual_pages] + torch.masked_select(block_table_tensor[:, :max_num_blocks], + mask, + out=paged_kv_indices) - paged_kv_indptr_cpu = torch.zeros(len(block_table_bounds_cpu) + 1, - dtype=torch.int32, - device='cpu') - paged_kv_indptr_cpu[1:] = block_table_bounds_cpu.cumsum( - dim=0, dtype=torch.int32) + # write self.paged_kv_indptr_cpu inplace (0-index is always 0) + torch.cumsum(block_table_bounds_cpu, + dim=0, + dtype=torch.int32, + out=self.paged_kv_indptr_cpu[1:1 + num_reqs]) paged_kv_last_page_len_cpu = seq_lens_cpu % page_size - paged_kv_last_page_len_cpu = torch.where( - paged_kv_last_page_len_cpu == 0, page_size, - paged_kv_last_page_len_cpu) + # write self.paged_kv_last_page_len_cpu inplace + torch.where(paged_kv_last_page_len_cpu == 0, + torch.tensor(page_size), + paged_kv_last_page_len_cpu, + 
out=self.paged_kv_last_page_len_cpu[:num_reqs]) + cache_dtype = self.cache_config.cache_dtype if cache_dtype.startswith("fp8"): kv_cache_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer( @@ -402,9 +510,10 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): attn_metadata = FlashInferMetadata( num_actual_tokens=num_actual_tokens, qo_indptr_cpu=common_attn_metadata.query_start_loc_cpu, - paged_kv_indptr_cpu=paged_kv_indptr_cpu, + paged_kv_indptr_cpu=self.paged_kv_indptr_cpu[:1 + num_reqs], paged_kv_indices=paged_kv_indices, - paged_kv_last_page_len_cpu=paged_kv_last_page_len_cpu, + paged_kv_last_page_len_cpu=self. + paged_kv_last_page_len_cpu[:num_reqs], num_qo_heads=self.vllm_config.model_config.get_num_attention_heads( self.vllm_config.parallel_config), num_kv_heads=self.kv_cache_spec.num_kv_heads, @@ -431,6 +540,26 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): return attn_metadata + def build_for_cudagraph_capture( + self, common_attn_metadata: CommonAttentionMetadata): + """ + This method builds the metadata for full cudagraph capture. + Currently, only decode is supported for full cudagraphs with FlashInfer. + """ + m = common_attn_metadata + + assert m.num_reqs == m.num_actual_tokens, \ + "FlashInfer only supports decode-only full CUDAGraph capture. " \ + "Make sure all cudagraph capture sizes <= max_num_seq." 
+ + m.max_query_len = 1 # decode-only + + return self.build(0, m) + + def can_run_in_cudagraph( + self, common_attn_metadata: CommonAttentionMetadata) -> bool: + return common_attn_metadata.max_query_len == 1 + def use_cascade_attention(self, *args, **kwargs) -> bool: if self.kv_cache_spec.dtype != self.vllm_config.model_config.dtype: # TODO: The cascade wrapper currently does not support setting @@ -638,3 +767,163 @@ class FlashInferImpl(AttentionImpl): out=output[:num_decode_tokens], ) return output_padded + + +def fast_plan_decode( + self, # decode wrapper + indptr_cpu: torch.Tensor, + indices: torch.Tensor, + last_page_len_cpu: torch.Tensor, + num_qo_heads: int, + num_kv_heads: int, + head_dim: int, + page_size: int, + pos_encoding_mode: str = "NONE", + window_left: int = -1, + logits_soft_cap: Optional[float] = None, + q_data_type: Optional[Union[str, torch.dtype]] = "float16", + kv_data_type: Optional[Union[str, torch.dtype]] = None, + data_type: Optional[Union[str, torch.dtype]] = None, + sm_scale: Optional[float] = None, + rope_scale: Optional[float] = None, + rope_theta: Optional[float] = None, + non_blocking: bool = True, +) -> None: + """ + A faster version of BatchDecodeWithPagedKVCacheWrapper::plan used for + cudagraph capture/replay, while the no cudagraph version turns back + to the original plan. + using original plan after passing host-side buffers: + - only host-to-device copy of indptr and last_page_len buffers + Modifications for cudagraph: + - only host-to-device copy of indptr and last_page_len buffers. + - avoid device-to-device copy of indices buffer. + + Part of the code get inspiration from the original plan from FlashInfer repo + and the implementation of fast_decode_plan for FlashInfer in SGlang repo. + """ + # Warm up with the original plan if it is first call, and always run the + # original plan if we run for dynamic shape. For fixed shape (cudagraph), + # this warm up is to generate the _cached_module for the decode wrapper. 
+ if not self.is_cuda_graph_enabled or \ + getattr(self, "vllm_first_call", True): + self.plan( + indptr_cpu, + indices, + last_page_len_cpu, + num_qo_heads, + num_kv_heads, + head_dim, + page_size, + pos_encoding_mode, + window_left, + logits_soft_cap, + q_data_type, + kv_data_type, + data_type, + sm_scale, + rope_scale, + rope_theta, + non_blocking, + ) + self.vllm_first_call = False + return + + assert self.is_cuda_graph_enabled, "Should be cudagraph only here" + + batch_size = len(last_page_len_cpu) + if logits_soft_cap is None: + logits_soft_cap = 0.0 + + # Handle data types consistently + if data_type is not None: + if q_data_type is None: + q_data_type = data_type + if kv_data_type is None: + kv_data_type = data_type + elif q_data_type is None: + q_data_type = "float16" + + if kv_data_type is None: + kv_data_type = q_data_type + q_data_type = getattr(torch, q_data_type) if isinstance( + q_data_type, str) else q_data_type + kv_data_type = getattr(torch, kv_data_type) if isinstance( + kv_data_type, str) else kv_data_type + + if self.use_tensor_cores: + qo_indptr_host = _get_range_buf(batch_size + 1, "cpu") + + if batch_size != self._fixed_batch_size: + raise ValueError( + "The batch size should be fixed in cudagraph mode, the runtime " + "batch size {} mismatches the batch size set during " + "initialization {}".format(batch_size, self._fixed_batch_size)) + if len(indices) > len(self._paged_kv_indices_buf): + raise ValueError( + "The size of indices should be less than or equal to the " + "allocated buffer") + + # host-to-device copy for the indptr buffer + self._paged_kv_indptr_buf.copy_(indptr_cpu, non_blocking=True) + # host-to-device copy for the last_page_len buffer + self._paged_kv_last_page_len_buf.copy_(last_page_len_cpu, + non_blocking=True) + + indptr_host = indptr_cpu + last_page_len_host = last_page_len_cpu + + if self.use_tensor_cores: + kv_lens_arr_host = get_seq_lens(indptr_host, last_page_len_host, + page_size) + + try: + # Make sure we pass 
exactly 15 arguments for tensor core version + self._plan_info = self._cached_module.plan( + self._float_workspace_buffer, + self._int_workspace_buffer, + self._pin_memory_int_workspace_buffer, + qo_indptr_host, + indptr_host, + kv_lens_arr_host, + batch_size, # total_num_rows + batch_size, + num_qo_heads, + num_kv_heads, + page_size, + self.is_cuda_graph_enabled, + head_dim, + head_dim, + False, # causal + ) + except Exception as e: + raise RuntimeError(f"Error in tensor core plan: {e}") from e + else: + try: + # Make sure we pass exactly 15 arguments for standard version + self._plan_info = self._cached_module.plan( + self._float_workspace_buffer, + self._int_workspace_buffer, + self._pin_memory_int_workspace_buffer, + indptr_host, + batch_size, + num_qo_heads, + num_kv_heads, + page_size, + self.is_cuda_graph_enabled, + window_left, + logits_soft_cap, + head_dim, + head_dim, + torch.empty(0, dtype=q_data_type), + torch.empty(0, dtype=kv_data_type), + ) + except Exception as e: + raise RuntimeError(f"Error in standard plan: {e}") from e + + self._pos_encoding_mode = pos_encoding_mode + self._window_left = window_left + self._logits_soft_cap = logits_soft_cap + self._sm_scale = sm_scale + self._rope_scale = rope_scale + self._rope_theta = rope_theta diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 39463b9c06164..b5aecff9937f3 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -18,6 +18,7 @@ from vllm.v1.attention.backends.mla.common import (MLACommonBackend, MLACommonImpl, MLACommonMetadata, MLACommonMetadataBuilder) +from vllm.v1.attention.backends.utils import AttentionCGSupport from vllm.v1.kv_cache_interface import AttentionSpec logger = init_logger(__name__) @@ -54,7 +55,8 @@ class FlashMLAMetadata(MLACommonMetadata[FlashMLADecodeMetadata]): class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): - full_cudagraph_supported: 
ClassVar[bool] = True # Decode-only + attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.PURE_DECODE_ONLY def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 5c5891f035ae2..8b55e1a301992 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -17,6 +17,7 @@ from vllm.v1.attention.backends.mla.common import (MLACommonBackend, MLACommonImpl, MLACommonMetadata, MLACommonMetadataBuilder) +from vllm.v1.attention.backends.utils import AttentionCGSupport from vllm.v1.kv_cache_interface import AttentionSpec # yapf: enable @@ -64,7 +65,8 @@ class AiterMLAMetadata(MLACommonMetadata[AiterMLADecodeMetadata]): class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): - full_cudagraph_supported: ClassVar[bool] = True # decode only + attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.PURE_DECODE_ONLY def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 195fbd3b1b9c4..942cb95eefa2f 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -18,7 +18,8 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata -from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder, +from vllm.v1.attention.backends.utils import (AttentionCGSupport, + AttentionMetadataBuilder, CommonAttentionMetadata) from vllm.v1.kv_cache_interface import AttentionSpec @@ -57,7 +58,8 @@ class TritonAttentionMetadata: class 
TritonAttentionMetadataBuilder( AttentionMetadataBuilder[TritonAttentionMetadata]): - full_cudagraph_supported: ClassVar[bool] = True + attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.ALWAYS def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], vllm_config: VllmConfig, device: torch.device): diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 36bacf0cb36f8..d39cc0a39f45c 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import abc +import enum import functools from abc import abstractmethod from dataclasses import dataclass, make_dataclass @@ -65,9 +66,24 @@ class CommonAttentionMetadata: M = TypeVar("M") +class AttentionCGSupport(enum.Enum): + """ Constants for the cudagraph support of the attention backend + Here we do not consider the cascade attention, as currently + it is never cudagraph supported.""" + + NEVER = 0 + """NO cudagraph support""" + PURE_DECODE_ONLY = 1 + """Cudagraph supported for pure decode, need to run without + cudagraph for mixed prefill-decode batches""" + ALWAYS = 2 + """Cudagraph always supported""" + + class AttentionMetadataBuilder(abc.ABC, Generic[M]): # Does this backend/builder support CUDA Graphs for attention. 
- full_cudagraph_supported: ClassVar[bool] = False + attn_cudagraph_support: ClassVar[AttentionCGSupport] = \ + AttentionCGSupport.NEVER @abstractmethod def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 29cda4d837bf3..d5a5799efb47c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -47,7 +47,7 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, is_pin_memory_available, round_up, supports_dynamo) from vllm.v1.attention.backends.mamba_selectors import get_mamba_attn_backend from vllm.v1.attention.backends.utils import ( - AttentionMetadataBuilder, CommonAttentionMetadata, + AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, make_kv_sharing_fast_prefill_attention_metadata, make_local_attention_virtual_batches) from vllm.v1.core.encoder_cache_manager import compute_encoder_budget @@ -2619,12 +2619,22 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.device, ) - if (self.full_cuda_graph - and not attn_metadata_builder_i.full_cudagraph_supported): - raise ValueError( - f"Full CUDAGraph not supported for " - f"{attn_backend_i.__name__}. Turn off CompilationConfig." - f"full_cuda_graph or use a different attention backend.") + if self.full_cuda_graph: + if attn_metadata_builder_i.attn_cudagraph_support == \ + AttentionCGSupport.NEVER: + raise ValueError(f"Full CUDAGraph not supported for " + f"{attn_backend_i.__name__}. Turn off " + f"CompilationConfig.full_cuda_graph or use a " + f" different attention backend.") + if attn_metadata_builder_i.attn_cudagraph_support == \ + AttentionCGSupport.PURE_DECODE_ONLY: + # Limit the max cudagraph size to the max number of + # sequences for pure decode only cudagraph backend, + # whose max_query_len is 1. 
+ self.cudagraph_batch_sizes = [ + size for size in self.cudagraph_batch_sizes + if size <= self.scheduler_config.max_num_seqs + ] return attn_backend_i, attn_metadata_builder_i def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 0f46ed223ab88..4bc4ece9a0df4 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -321,11 +321,16 @@ class Worker(WorkerBase): if get_pp_group().is_last_rank: max_num_reqs = min(self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens) + # activate building attn_metadata for this dummy run to avoid + # potential illegal memory access for full cudagraph replay. + attn_cudagraph = self.compilation_config.full_cuda_graph and\ + not self.model_config.enforce_eager # We skip EPLB here since we don't want to record dummy metrics hidden_states, last_hidden_states = \ self.model_runner._dummy_run( num_tokens=max_num_reqs, + capture_attn_cudagraph=attn_cudagraph, skip_eplb=True, ) if self.model_runner.is_pooling_model: From ee2eb6ecd86be4b47e334f74feb7874b9a41ca25 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Sat, 2 Aug 2025 10:34:37 +0800 Subject: [PATCH 49/54] [Model] Qwen2.5 VL SiLU-and-Mul (#22066) Signed-off-by: kf Signed-off-by: vllmellm Co-authored-by: kf --- vllm/model_executor/models/qwen2_5_vl.py | 44 +++++++++++------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index c4c4650f569e1..04e64422d2e0b 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -43,9 +43,10 @@ from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY +from
vllm.model_executor.layers.activation import get_act_and_mul_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig @@ -171,16 +172,12 @@ class Qwen2_5_VisionMLP(nn.Module): quant_config: Optional[QuantizationConfig] = None, prefix: str = ""): super().__init__() - self.gate_proj = ColumnParallelLinear(in_features, - hidden_features, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.gate_proj") - self.up_proj = ColumnParallelLinear(in_features, - hidden_features, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.up_proj") + self.gate_up_proj = MergedColumnParallelLinear( + input_size=in_features, + output_sizes=[hidden_features] * 2, # [gate_proj, up_proj] + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") self.down_proj = RowParallelLinear(hidden_features, in_features, bias=bias, @@ -189,10 +186,9 @@ class Qwen2_5_VisionMLP(nn.Module): self.act_fn = act_fn def forward(self, x: torch.Tensor): - x_gate, _ = self.gate_proj(x) - x_gate = self.act_fn(x_gate) - x_up, _ = self.up_proj(x) - x_down, _ = self.down_proj(x_gate * x_up) + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x_down, _ = self.down_proj(x) return x_down @@ -540,14 +536,14 @@ class Qwen2_5_VisionTransformer(nn.Module): self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2) self.blocks = nn.ModuleList([ - Qwen2_5_VisionBlock( - dim=self.hidden_size, - num_heads=self.num_heads, - mlp_hidden_dim=vision_config.intermediate_size, - act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], - norm_layer=norm_layer, - quant_config=quant_config, - prefix=f"{prefix}.blocks.{layer_idx}") + Qwen2_5_VisionBlock(dim=self.hidden_size, + num_heads=self.num_heads, + mlp_hidden_dim=vision_config.intermediate_size, + 
act_fn=get_act_and_mul_fn( + vision_config.hidden_act), + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{layer_idx}") for layer_idx in range(depth) ]) self.merger = Qwen2_5_VisionPatchMerger( @@ -752,6 +748,8 @@ class Qwen2_5_VisionTransformer(nn.Module): ("attn.qkv.", "attn.q.", "q"), ("attn.qkv.", "attn.k.", "k"), ("attn.qkv.", "attn.v.", "v"), + ("mlp.gate_up_proj.", "mlp.gate_proj.", 0), + ("mlp.gate_up_proj.", "mlp.up_proj.", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) loaded_params: set[str] = set() From 57393715e804387588241fbdb4ec94a7570230b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Sat, 2 Aug 2025 04:41:40 +0200 Subject: [PATCH 50/54] [Misc] `VLLM_TARGET_DEVICE.lower()` (#22101) Signed-off-by: NickLucche --- vllm/envs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index 2fda2903179b5..c161fa0dff6ba 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -213,7 +213,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # Target device of vLLM, supporting [cuda (by default), # rocm, neuron, cpu] "VLLM_TARGET_DEVICE": - lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"), + lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(), # Maximum number of compilation jobs to run in parallel. 
# By default this is the number of CPUs From a65f46be5ea9a92dde48df2b951c1915aa1d9595 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Sat, 2 Aug 2025 08:12:03 +0530 Subject: [PATCH 51/54] [Misc] DeepGemmExperts : Avoid JIT generation in the hot-path (#21955) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- vllm/envs.py | 9 +++ .../layers/fused_moe/deep_gemm_moe.py | 77 ++++++++++++++++++- vllm/utils/deep_gemm.py | 7 ++ 3 files changed, 92 insertions(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index c161fa0dff6ba..2d470c6dccbfd 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -126,6 +126,7 @@ if TYPE_CHECKING: VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None VLLM_TPU_USING_PATHWAYS: bool = False VLLM_USE_DEEP_GEMM: bool = False + VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False VLLM_USE_FLASHINFER_MOE_FP8: bool = False VLLM_USE_FLASHINFER_MOE_FP4: bool = False VLLM_XGRAMMAR_CACHE_MB: int = 0 @@ -910,6 +911,14 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_DEEP_GEMM": lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))), + # DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm + # JIT all the required kernels before model execution so there is no + # JIT'ing in the hot-path. However, this warmup increases the engine + # startup time by a couple of minutes. + # Set `VLLM_SKIP_DEEP_GEMM_WARMUP` to disable the warmup. + "VLLM_SKIP_DEEP_GEMM_WARMUP": + lambda: bool(int(os.getenv("VLLM_SKIP_DEEP_GEMM_WARMUP", "0"))), + # Allow use of FlashInfer MoE kernels for fused moe ops. 
"VLLM_USE_FLASHINFER_MOE_FP8": lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))), diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index b89e5ac6f093e..bd3605378b6dc 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -4,7 +4,9 @@ import functools from typing import Any, Optional import torch +from tqdm import tqdm +import vllm.envs as env import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig @@ -17,7 +19,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( from vllm.model_executor.layers.fused_moe.utils import _resize_cache from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8) -from vllm.utils import has_deep_gemm +from vllm.utils import has_deep_gemm, run_once from vllm.utils.deep_gemm import m_grouped_fp8_gemm_nt_contiguous logger = init_logger(__name__) @@ -82,6 +84,65 @@ def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor, return True +@run_once +def warmup_deepgemm_gg_contiguous_kernels(w1: torch.Tensor, w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + num_topk: int): + """ + DeepGemm JITs the grouped-gemm kernels. The JIT'ing happens based on the + input tensor shapes. In this function, we construct all possible input + tensor shapes so all the kernels are JIT'ed and cached. + Note that this warmup is expected to happen during the model profile + call and not during actual model inference. + """ + + assert w1.size(0) == w2.size(0), ( + "w1 and w2 must have the same number of experts") + + block_m = deep_gemm_block_shape()[0] + num_experts = w1.size(0) + device = w1.device + + # This is the maximum GroupedGemm M size that we expect to run + # the grouped_gemm with. 
+ MAX_M = compute_aligned_M(env.VLLM_FUSED_MOE_CHUNK_SIZE, + num_topk, + num_experts, + block_m, + expert_tokens_meta=None) + # Distribute expert-ids evenly. + MAX_BLOCKS = MAX_M // block_m + expert_ids_block = torch.randint(low=0, + high=num_experts, + size=(MAX_BLOCKS, ), + device=device, + dtype=torch.int32) + expert_ids = torch.repeat_interleave(expert_ids_block, block_m, dim=0) + + def _warmup(w: torch.Tensor, w_scale: torch.Tensor): + + _, n, k = w.size() + a1q = torch.empty((MAX_M, k), device=device).to(torch.float8_e4m3fn) + a1q_scales = torch.empty((MAX_M, k // block_m), + device=device, + dtype=torch.float32) + out = torch.empty((MAX_M, n), device=device, dtype=torch.bfloat16) + + pbar = tqdm(total=MAX_BLOCKS, + desc=f"DeepGemmExperts GEMM warmup (MAX_M={MAX_M})") + num_tokens = MAX_M + while num_tokens > 0: + m_grouped_fp8_gemm_nt_contiguous( + (a1q[:num_tokens], a1q_scales[:num_tokens]), (w, w_scale), + out[:num_tokens], expert_ids[:num_tokens]) + pbar.update(1) + num_tokens = num_tokens - block_m + + _warmup(w1, w1_scale) + _warmup(w2, w2_scale) + + class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): def __init__(self): @@ -156,6 +217,20 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): ): assert self.block_shape is not None assert a1q_scale is not None + assert w1_scale is not None + assert w2_scale is not None + + if not env.VLLM_SKIP_DEEP_GEMM_WARMUP: + # DeepGemm JITs the grouped-gemm kernels. We don't want the JIT'ing + # to happen during actual model-inference. The + # `warmup_deepgemm_gg_contiguous_kernels` function is a `run_once` decorated + # function that executes during the model profile run. This warmup + # should create all the required JITs for the current model.
+ warmup_deepgemm_gg_contiguous_kernels(w1, + w2, + w1_scale, + w2_scale, + num_topk=topk_ids.size(1)) a1q = hidden_states _, N, K = w1.size() diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 4dedee2a3f862..8ab34e7505ee2 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -8,6 +8,7 @@ from __future__ import annotations import functools import importlib +import os from typing import Any, Callable, NoReturn import torch @@ -77,6 +78,12 @@ def _lazy_init() -> None: if not has_deep_gemm(): return + # Set up deep_gemm cache path + DEEP_GEMM_JIT_CACHE_ENV_NAME = 'DG_JIT_CACHE_DIR' + if not os.environ.get(DEEP_GEMM_JIT_CACHE_ENV_NAME, None): + os.environ[DEEP_GEMM_JIT_CACHE_ENV_NAME] = os.path.join( + envs.VLLM_CACHE_ROOT, "deep_gemm") + _dg = importlib.import_module("deep_gemm") _fp8_gemm_nt_impl = _resolve_symbol(_dg, "fp8_gemm_nt", From 9f9c38c392476fd35b9154221c00a2255dcfd010 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Fri, 1 Aug 2025 22:43:37 -0400 Subject: [PATCH 52/54] [Speculators][Speculative Decoding] Add Qwen Eagle3 Support (#21835) Signed-off-by: Dipika Sikka --- .../speculators/test_eagle3.py | 14 +++++++++++-- vllm/config.py | 15 ++++++++++--- vllm/model_executor/models/qwen2.py | 21 +++++++++++++------ vllm/model_executor/models/qwen3.py | 7 +++++++ 4 files changed, 46 insertions(+), 11 deletions(-) diff --git a/tests/speculative_decoding/speculators/test_eagle3.py b/tests/speculative_decoding/speculators/test_eagle3.py index c58fc8c0dc5f4..c46ac7a88b751 100644 --- a/tests/speculative_decoding/speculators/test_eagle3.py +++ b/tests/speculative_decoding/speculators/test_eagle3.py @@ -6,11 +6,21 @@ import torch @pytest.mark.parametrize( "model_path", - [("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717"), - ("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized")]) + [("nm-testing/SpeculatorLlama3-1-8B-Eagle3-converted-0717-quantized")]) def test_llama(vllm_runner, example_prompts, model_path): 
with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens=20) print(vllm_outputs) assert vllm_outputs + + +@pytest.mark.parametrize( + "model_path", + [("nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized")]) +def test_qwen(vllm_runner, example_prompts, model_path): + with vllm_runner(model_path, dtype=torch.bfloat16) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens=20) + print(vllm_outputs) + assert vllm_outputs diff --git a/vllm/config.py b/vllm/config.py index dabb4b524dfd8..95dae4275edf3 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3175,10 +3175,19 @@ class SpeculativeConfig: "speculative decoding is > 1, but got " f"{self.disable_by_batch_size=}") - if self.method == "eagle3" and self.target_model_config and \ - "llama" not in self.target_model_config.hf_text_config.model_type: + from vllm.transformers_utils.configs import SpeculatorsConfig + + eagle3_target_supported = ["llama"] + if self.draft_model_config and isinstance( + self.draft_model_config.hf_config, SpeculatorsConfig): + eagle3_target_supported.append("qwen") + + if self.method == "eagle3" and self.target_model_config and not any( + supported_model in + self.target_model_config.hf_text_config.model_type + for supported_model in eagle3_target_supported): raise ValueError( - "Eagle3 is only supported for Llama models. " + f"Eagle3 is only supported for {eagle3_target_supported} models. 
" # noqa: E501 f"Got {self.target_model_config.hf_text_config.model_type=}") return self diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 23f65b99c22ce..0e7507a4570be 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -330,6 +330,8 @@ class Qwen2Model(nn.Module): else: self.norm = PPMissingLayer() + self.aux_hidden_state_layers: tuple[int] = tuple() + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.embed_tokens(input_ids) @@ -350,18 +352,25 @@ class Qwen2Model(nn.Module): assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - for layer in self.layers[self.start_layer:self.end_layer]: - hidden_states, residual = layer( - positions, - hidden_states, - residual, - ) + + aux_hidden_states = [] + for idx, layer in enumerate( + self.layers[self.start_layer:self.end_layer]): + if idx in self.aux_hidden_state_layers: + aux_hidden_states.append(hidden_states + residual) + hidden_states, residual = layer(positions, hidden_states, residual) + if not get_pp_group().is_last_rank: return IntermediateTensors({ "hidden_states": hidden_states, "residual": residual }) + hidden_states, _ = self.norm(hidden_states, residual) + + if len(aux_hidden_states) > 0: + return hidden_states, aux_hidden_states + return hidden_states def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 393ce41a91a00..d2ae8959b103d 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -288,6 +288,13 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def set_aux_hidden_state_layers(self, layers: tuple[int]) -> None: + self.model.aux_hidden_state_layers = layers + + def 
get_eagle3_aux_hidden_state_layers(self) -> tuple[int]: + num_layers = len(self.model.layers) + return (2, num_layers // 2, num_layers - 3) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) From 8d524ce79ffd0571d6a576cb9a5c21feab187246 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sat, 2 Aug 2025 03:45:27 +0100 Subject: [PATCH 53/54] [BugFix] Improve internal DP load balancing (#21617) Signed-off-by: Nick Hill --- vllm/entrypoints/openai/api_server.py | 3 + vllm/v1/engine/async_llm.py | 4 + vllm/v1/engine/coordinator.py | 110 +++++++++++++++++--------- vllm/v1/engine/core.py | 13 +-- vllm/v1/engine/core_client.py | 46 +++++++---- vllm/v1/metrics/stats.py | 4 + vllm/v1/utils.py | 1 + 7 files changed, 122 insertions(+), 59 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index b8ec5461f7719..9bf4702320788 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -199,6 +199,8 @@ async def build_async_engine_client_from_engine_args( from vllm.v1.engine.async_llm import AsyncLLM async_llm: Optional[AsyncLLM] = None + client_count = client_config.pop( + "client_count") if client_config else 1 client_index = client_config.pop( "client_index") if client_config else 0 try: @@ -208,6 +210,7 @@ async def build_async_engine_client_from_engine_args( enable_log_requests=engine_args.enable_log_requests, disable_log_stats=engine_args.disable_log_stats, client_addresses=client_config, + client_count=client_count, client_index=client_index) # Don't keep the dummy data in memory diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 308ca32105ba9..45f450291ab63 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -57,6 +57,7 @@ class AsyncLLM(EngineClient): start_engine_loop: bool = True, stat_loggers: Optional[list[StatLoggerFactory]] = None, client_addresses: 
Optional[dict[str, str]] = None, + client_count: int = 1, client_index: int = 0, ) -> None: """ @@ -120,6 +121,7 @@ class AsyncLLM(EngineClient): executor_class=executor_class, log_stats=self.log_stats, client_addresses=client_addresses, + client_count=client_count, client_index=client_index, ) @@ -156,6 +158,7 @@ class AsyncLLM(EngineClient): enable_log_requests: bool = False, disable_log_stats: bool = False, client_addresses: Optional[dict[str, str]] = None, + client_count: int = 1, client_index: int = 0, disable_log_requests: bool = True, # Deprecated, will be removed ) -> "AsyncLLM": @@ -176,6 +179,7 @@ class AsyncLLM(EngineClient): log_stats=not disable_log_stats, usage_context=usage_context, client_addresses=client_addresses, + client_count=client_count, client_index=client_index, ) diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py index 8d8d1689e61e3..596edfdbe24f8 100644 --- a/vllm/v1/engine/coordinator.py +++ b/vllm/v1/engine/coordinator.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import copy import multiprocessing import time import weakref @@ -65,18 +66,14 @@ class DPCoordinator: # Assume coordinator is colocated with front-end procs when not in # either external or hybrid DP LB mode. + local_only = not (external_lb or hybrid_lb) front_publish_address = get_engine_client_zmq_addr( - local_only=not external_lb and not hybrid_lb, host=host) + local_only=local_only, host=host) local_only_eng = dp_size == parallel_config.data_parallel_size_local back_publish_address = get_engine_client_zmq_addr(local_only_eng, host) back_output_address = get_engine_client_zmq_addr(local_only_eng, host) - # When in external LB mode, load stats aren't published, only changes - # to request wave / running state, so we don't need to rate-limit the - # updates to the front-end proc(s). 
- min_stats_update_interval_ms = 0 if external_lb else 100 - context = get_mp_context() self.proc: multiprocessing.Process = context.Process( target=DPCoordinatorProc.run_coordinator, @@ -86,7 +83,6 @@ class DPCoordinator: "front_publish_address": front_publish_address, "back_output_address": back_output_address, "back_publish_address": back_publish_address, - "min_stats_update_interval_ms": min_stats_update_interval_ms, }, daemon=True) self.proc.start() @@ -125,10 +121,6 @@ class DPCoordinatorProc: self.stats_update_interval_ms = min_stats_update_interval_ms - self.current_wave = 0 - self.engines_running = False - self.stats_changed = False - @staticmethod def run_coordinator( engine_count: int, @@ -155,6 +147,16 @@ class DPCoordinatorProc: decoder = MsgpackDecoder(EngineCoreOutputs) + # For tracking request wave progression. + current_wave = 0 + engines_running = False + + # For tracking request counts for internal load-balancing. + stats_changed = False + last_stats_step = -1 + last_stats_wave = -1 + last_step_counts: Optional[list[list[int]]] = None + with make_zmq_socket( path=front_publish_address, # IPC ctx=self.ctx, @@ -191,21 +193,33 @@ class DPCoordinatorProc: while True: elapsed = int(time.time() * 1000) - last_publish_time # Send at stats_update_interval_ms interval if the stats have - # changed, or otherwise every 4 seconds. + # changed, or otherwise every 5 seconds. wait_for = (self.stats_update_interval_ms - if self.stats_changed else 4000) - events = poller.poll(timeout=max(0, wait_for - elapsed)) + if stats_changed else 5000) + + # Wait at least 50ms to ensure we've received all stats for + # the current step. + min_timeout = 50 if last_step_counts is None else 0 + + events = poller.poll(timeout=max(min_timeout, wait_for - + elapsed)) if not events: # Poller timeout - publish current stats to front-ends. 
- engine_req_counts_list = self._get_engine_counts() - to_publish = (engine_req_counts_list, self.current_wave, - self.engines_running) + if last_step_counts is not None: + engine_req_counts_list = last_step_counts + last_step_counts = None + else: + engine_req_counts_list = self._get_engine_counts() + stats_changed = False + + to_publish = (engine_req_counts_list, current_wave, + engines_running) publish_front.send(msgspec.msgpack.encode(to_publish)) last_publish_time = int(time.time() * 1000) - self.stats_changed = False continue events = dict(events) + wave_state_changed = False if publish_front in events: buffer = publish_front.recv() @@ -232,7 +246,7 @@ class DPCoordinatorProc: # current_wave # we note that 0 is the wave number for the new # engine - self.engines_running = False + engines_running = False logger.info( "DPCoordinator scaled up from %s to %s " "engines", current_count, new_engine_count) @@ -248,15 +262,15 @@ class DPCoordinatorProc: # engines are paused, so that we can wake the other # engines. engine_to_exclude, wave = decoded - if not self.engines_running: - if wave < self.current_wave: + if not engines_running: + if wave < current_wave: # If the wave number is stale, ensure the message # is handled by all the engines. engine_to_exclude = None - self.engines_running = True - self.stats_changed = True - self._send_start_wave(publish_back, self.current_wave, + engines_running = True + wave_state_changed = True + self._send_start_wave(publish_back, current_wave, engine_to_exclude) if output_back in events: @@ -274,36 +288,56 @@ class DPCoordinatorProc: # 1. Updated request load stats - update our local # state with these. 
stats = self.engines[eng_index].request_counts + stats_step = scheduler_stats.step_counter + stats_wave = scheduler_stats.current_wave + if (stats_wave > last_stats_wave + or stats_wave == last_stats_wave + and stats_step > last_stats_step): + if stats_changed: + last_step_counts = self._get_engine_counts( + do_copy=True) + last_stats_step = stats_step + last_stats_wave = stats_wave + elif stats_wave != last_stats_wave or ( + stats_step != last_stats_step): + logger.warning( + "Received stats for out-of-order " + "step (%d, %d) from engine %d (expected " + "> (%d, %d))", stats_wave, stats_step, + eng_index, last_stats_wave, last_stats_step) stats[0] = scheduler_stats.num_waiting_reqs stats[1] = scheduler_stats.num_running_reqs - self.stats_changed = True + stats_changed = True if (wave := outputs.wave_complete) is not None: # 2. Notification from rank 0 engine that we've # moved into the global paused state # (engines_running==False). - if self.current_wave <= wave: + if current_wave <= wave: new_wave = wave + 1 logger.debug("Moving DP wave from %d to %d.", - self.current_wave, new_wave) - self.current_wave = new_wave - self.engines_running = False - self.stats_changed = True + current_wave, new_wave) + current_wave = new_wave + engines_running = False + wave_state_changed = True elif (wave := outputs.start_wave) is not None and ( - wave > self.current_wave or - (wave == self.current_wave - and not self.engines_running)): + wave > current_wave or + (wave == current_wave and not engines_running)): # 3. The engine received request for a non-current wave # so we must ensure that other engines progress to the # next wave (race condition handling). 
logger.debug( "Starting wave %d after notification of " "stale wave request from engine.", wave) - self.current_wave = wave - self.engines_running = True - self.stats_changed = True + current_wave = wave + engines_running = True + wave_state_changed = True self._send_start_wave(publish_back, wave, eng_index) + if wave_state_changed: + message = (None, current_wave, engines_running) + publish_front.send(msgspec.msgpack.encode(message)) + @staticmethod def _send_start_wave(socket: zmq.Socket, wave: int, exclude_engine_index: Optional[int]): @@ -316,6 +350,8 @@ class DPCoordinatorProc: socket.send_multipart( (EngineCoreRequestType.START_DP_WAVE.value, wave_encoded)) - def _get_engine_counts(self) -> list[list[int]]: + def _get_engine_counts(self, do_copy=False) -> list[list[int]]: """Return list of [waiting, running] count lists for each engine.""" + if do_copy: + return [copy.copy(e.request_counts) for e in self.engines] return [e.request_counts for e in self.engines] diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 0a889b2a0a184..79c47e1028882 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -928,7 +928,7 @@ class DPEngineCoreProc(EngineCoreProc): ): # Counts forward-passes of the model so that we can synchronize # finished with DP peers every N steps. - self.counter = 0 + self.step_counter = 0 self.current_wave = 0 self.last_counts = (0, 0) @@ -999,7 +999,9 @@ class DPEngineCoreProc(EngineCoreProc): counts = self.scheduler.get_request_counts() if counts != self.last_counts: self.last_counts = counts - stats = SchedulerStats(*counts) + stats = SchedulerStats(*counts, + step_counter=self.step_counter, + current_wave=self.current_wave) self.output_queue.put_nowait( (-1, EngineCoreOutputs(scheduler_stats=stats))) @@ -1041,15 +1043,16 @@ class DPEngineCoreProc(EngineCoreProc): self.output_queue.put_nowait( (client_index, EngineCoreOutputs(wave_complete=self.current_wave))) + # Increment wave count and reset step counter. 
self.current_wave += 1 + self.step_counter = 0 def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool: # Optimization - only perform finish-sync all-reduce every 32 steps. - self.counter += 1 - if self.counter != 32: + self.step_counter += 1 + if self.step_counter % 32 != 0: return True - self.counter = 0 return ParallelConfig.has_unfinished_dp(self.dp_group, local_unfinished) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 26985df6f62df..4d30bb6b74466 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -86,11 +86,12 @@ class EngineCoreClient(ABC): executor_class: type[Executor], log_stats: bool, client_addresses: Optional[dict[str, str]] = None, + client_count: int = 1, client_index: int = 0, ) -> "MPClient": parallel_config = vllm_config.parallel_config client_args = (vllm_config, executor_class, log_stats, - client_addresses, client_index) + client_addresses, client_count, client_index) if parallel_config.data_parallel_size > 1: if parallel_config.data_parallel_external_lb: # External load balancer - client per DP rank. @@ -727,6 +728,7 @@ class AsyncMPClient(MPClient): executor_class: type[Executor], log_stats: bool, client_addresses: Optional[dict[str, str]] = None, + client_count: int = 1, client_index: int = 0): super().__init__( asyncio_mode=True, @@ -929,11 +931,12 @@ class DPAsyncMPClient(AsyncMPClient): executor_class: type[Executor], log_stats: bool, client_addresses: Optional[dict[str, str]] = None, + client_count: int = 1, client_index: int = 0): self.current_wave = 0 super().__init__(vllm_config, executor_class, log_stats, - client_addresses, client_index) + client_addresses, client_count, client_index) # List of [waiting, running] pair per engine. # Used only by DPLBAsyncMPClient subclass. 
@@ -1029,7 +1032,11 @@ class DPAsyncMPClient(AsyncMPClient): counts, wave, running = msgspec.msgpack.decode(buf) self.current_wave = wave self.engines_running = running - self.lb_engines = counts[count_slice] + if counts is not None: + sliced_counts = counts[count_slice] + self.lb_engines = sliced_counts + logger.debug("Received counts: %s (%s)", sliced_counts, + count_slice) resources.stats_update_task = asyncio.create_task( run_engine_stats_update_task()) @@ -1065,40 +1072,45 @@ class DPLBAsyncMPClient(DPAsyncMPClient): executor_class: type[Executor], log_stats: bool, client_addresses: Optional[dict[str, str]] = None, + client_count: int = 1, client_index: int = 0): + self.client_count = client_count + # To route aborts to the correct engine. self.reqs_in_flight: dict[str, EngineIdentity] = {} super().__init__(vllm_config, executor_class, log_stats, - client_addresses, client_index) + client_addresses, client_count, client_index) assert len(self.core_engines) > 1 + self.eng_start_index = (len(self.core_engines) * + self.client_index) // client_count + def get_core_engine_for_request( self, request: EngineCoreRequest) -> EngineIdentity: # Engines are in rank order. + current_counts = self.lb_engines if (eng_index := request.data_parallel_rank) is None: - if not self.lb_engines: + if not current_counts: return self.core_engine # TODO use P2C alg for larger DP sizes - num_engines = len(self.lb_engines) - min_counts = [sys.maxsize, sys.maxsize] + num_engines = len(current_counts) + min_score = sys.maxsize eng_index = 0 for i in range(num_engines): # Start from client_index to help with balancing when engines # are empty. 
- idx = (self.client_index + i) % num_engines - counts = self.lb_engines[idx] - if counts < min_counts: - min_counts = counts + idx = (self.eng_start_index + i) % num_engines + waiting, running = current_counts[idx] + score = waiting * 4 + running + if score < min_score: + min_score = score eng_index = idx - # Adjust local counts for better balancing between stats updates - # from the coordinator (which happen every 100ms). - if min_counts[0]: - min_counts[0] += 1 - else: - min_counts[1] += 1 + # Increment local waiting count for better balancing between stats + # updates from the coordinator (which happen every 100ms). + current_counts[eng_index][0] += self.client_count chosen_engine = self.core_engines[eng_index] # Record which engine is chosen for this request, to handle aborts. diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 1eb10ccb6c493..9a80460261e02 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -33,6 +33,10 @@ class SchedulerStats: num_running_reqs: int = 0 num_waiting_reqs: int = 0 + # These are used for internal DP load-balancing. 
+ step_counter: int = 0 + current_wave: int = 0 + kv_cache_usage: float = 0.0 prefix_cache_stats: PrefixCacheStats = field( diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index c74d8c543f76c..d0175695c1d0f 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -154,6 +154,7 @@ class APIServerProcessManager: client_config = { "input_address": in_addr, "output_address": out_addr, + "client_count": num_servers, "client_index": i } if stats_update_address is not None: From 6e8d8c4afbddf725b34ef938616701869f5b3462 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 1 Aug 2025 22:45:46 -0400 Subject: [PATCH 54/54] [Test] Add Unit Test for Batched DeepGEMM (#21559) Signed-off-by: yewentao256 --- tests/kernels/moe/test_batched_deepgemm.py | 103 +++++++++++++++++++++ tests/kernels/moe/test_deepgemm.py | 8 +- vllm/utils/deep_gemm.py | 4 +- 3 files changed, 107 insertions(+), 8 deletions(-) create mode 100644 tests/kernels/moe/test_batched_deepgemm.py diff --git a/tests/kernels/moe/test_batched_deepgemm.py b/tests/kernels/moe/test_batched_deepgemm.py new file mode 100644 index 0000000000000..018d4c224f75e --- /dev/null +++ b/tests/kernels/moe/test_batched_deepgemm.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import torch + +from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( + BatchedDeepGemmExperts) +from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( + BatchedPrepareAndFinalize, BatchedTritonExperts) +from vllm.model_executor.layers.fused_moe.modular_kernel import ( + FusedMoEModularKernel) +from vllm.utils.deep_gemm import calc_diff, is_deep_gemm_supported + +from .test_deepgemm import make_block_quant_fp8_weights + +BLOCK_SIZE = [128, 128] + + +@pytest.mark.skipif(not is_deep_gemm_supported(), + reason="Requires deep_gemm kernels") +@pytest.mark.parametrize("E", [16, 32]) # number of 
experts +@pytest.mark.parametrize("T", [256, 512]) # tokens per expert +@pytest.mark.parametrize("K", [128, 256]) # hidden dim +@pytest.mark.parametrize("N", [512, 1024]) # intermediate dim per expert +@pytest.mark.parametrize("topk", [2, 4]) +def test_batched_deepgemm_vs_triton(E: int, T: int, K: int, N: int, topk: int, + monkeypatch): + """Compare BatchedDeepGemmExperts to BatchedTritonExperts.""" + + monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1") + + device = "cuda" + w1, w2, w1_s, w2_s = make_block_quant_fp8_weights(E, N, K, BLOCK_SIZE) + + M = E * T # total tokens + a = torch.randn(M, K, device=device, dtype=torch.bfloat16) / 10.0 + fp8_info = torch.finfo(torch.float8_e4m3fn) + a.clamp_(fp8_info.min, fp8_info.max) + + # random router outputs → top-k indices / weights + router_logits = torch.randn(M, E, device=device, dtype=torch.float32) + topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1) + topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1) + + # token number for each expert + cnt = torch.bincount(topk_ids.flatten(), minlength=E) + max_cnt = int(cnt.max().item()) + # next power of 2 for max token number + max_num_tokens = 1 << (max_cnt - 1).bit_length() + + prep_finalize = BatchedPrepareAndFinalize( + max_num_tokens=max_num_tokens, + num_local_experts=E, + num_dispatchers=1, + rank=0, + ) + + # triton (reference) + triton_experts = BatchedTritonExperts( + max_num_tokens=max_num_tokens, + num_dispatchers=1, + use_fp8_w8a8=True, + per_act_token_quant=False, + block_shape=BLOCK_SIZE, + ) + mk_triton = FusedMoEModularKernel(prep_finalize, triton_experts) + + out_triton = mk_triton( + hidden_states=a, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=False, + w1_scale=w1_s, + w2_scale=w2_s, + global_num_experts=E, + ) + + # deepgemm + deepgemm_experts = BatchedDeepGemmExperts( + max_num_tokens=max_num_tokens, + num_dispatchers=1, + block_shape=BLOCK_SIZE, + per_act_token_quant=False, + ) + mk_deepgemm = 
FusedMoEModularKernel(prep_finalize, deepgemm_experts) + + out_deepgemm = mk_deepgemm( + hidden_states=a, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=False, + w1_scale=w1_s, + w2_scale=w2_s, + global_num_experts=E, + ) + + diff = calc_diff(out_deepgemm, out_triton) + assert diff < 1e-3, f"Output diff too large: {diff}" diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py index b6ea4ee2324c9..b2b78662c9ded 100644 --- a/tests/kernels/moe/test_deepgemm.py +++ b/tests/kernels/moe/test_deepgemm.py @@ -20,11 +20,6 @@ from vllm.utils.deep_gemm import (calc_diff, is_deep_gemm_supported, BLOCK_SIZE = [128, 128] -requires_deep_gemm = pytest.mark.skipif( - not is_deep_gemm_supported(), - reason="Requires deep_gemm kernels", -) - def make_block_quant_fp8_weights( e: int, @@ -152,7 +147,8 @@ NUM_EXPERTS = [32] @pytest.mark.parametrize("mnk", MNKs) @pytest.mark.parametrize("topk", TOPKS) @pytest.mark.parametrize("num_experts", NUM_EXPERTS) -@requires_deep_gemm +@pytest.mark.skipif(not is_deep_gemm_supported(), + reason="Requires deep_gemm kernels") def test_deepgemm_vs_triton(mnk, topk, num_experts, monkeypatch): with monkeypatch.context() as m: diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 8ab34e7505ee2..0edfb01cde9d6 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -23,10 +23,10 @@ def is_deep_gemm_supported() -> bool: """Return ``True`` if DeepGEMM is supported on the current platform. Currently, only Hopper and Blackwell GPUs are supported. """ - supported_arch = current_platform.is_cuda() and ( + is_supported_arch = current_platform.is_cuda() and ( current_platform.is_device_capability(90) or current_platform.is_device_capability(100)) - return has_deep_gemm() and supported_arch + return has_deep_gemm() and is_supported_arch @functools.cache