From 93cabc417cf88210f16ebf4b3c20b9adbe9ed83a Mon Sep 17 00:00:00 2001 From: Ameen Patel Date: Sun, 21 Dec 2025 07:43:01 -0800 Subject: [PATCH 01/31] ci: add nvidia-smi warmup before Prime-RL integration test (#31093) Signed-off-by: AmeenP --- .buildkite/test-pipeline.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 3c823fc872b05..1186acad48ae0 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1359,6 +1359,7 @@ steps: - vllm/ - .buildkite/scripts/run-prime-rl-test.sh commands: + - nvidia-smi - bash .buildkite/scripts/run-prime-rl-test.sh - label: DeepSeek V2-Lite Accuracy From b471092d3a983485243dd153777d026bf5f49183 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Sun, 21 Dec 2025 12:37:42 -0500 Subject: [PATCH 02/31] [MoE Refactor][4/N] Marlin Fp8 Mk (#31036) --- tests/quantization/test_fp8.py | 4 + .../model_executor/layers/fused_moe/config.py | 32 +++++++- .../layers/fused_moe/fused_marlin_moe.py | 38 +++++----- .../model_executor/layers/quantization/fp8.py | 74 +++++++++---------- 4 files changed, 85 insertions(+), 63 deletions(-) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 62203186510ce..c9ab24fd439c5 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -15,6 +15,7 @@ from vllm.model_executor.layers.quantization.fp8 import ( Fp8Config, Fp8KVCacheMethod, Fp8LinearMethod, + Fp8MoeBackend, Fp8MoEMethod, ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -324,7 +325,10 @@ def test_fp8_reloading( weight_loader=default_weight_loader, ) + # Fp8LinearMethod uses use_marlin + # Fp8MoEMethod uses fp8_backend method.use_marlin = use_marlin + method.fp8_backend = Fp8MoeBackend.MARLIN if use_marlin else None # capture weights format during loading original_metadata = [ diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index a9a2990ca2b53..d581e91f36d03 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -19,6 +19,7 @@ from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import ( OCP_MX_Scheme, ) from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape +from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe from vllm.utils.import_utils import has_triton_kernels from vllm.utils.math_utils import cdiv @@ -39,6 +40,7 @@ if has_triton_kernels(): def _get_config_dtype_str( dtype: torch.dtype, use_fp8_w8a8: bool = False, + use_fp8_w8a16: bool = False, use_int8_w8a16: bool = False, use_int4_w4a16: bool = False, ocp_mx_scheme: str | None = None, @@ -50,6 +52,8 @@ def _get_config_dtype_str( """ if use_fp8_w8a8: return "fp8_w8a8" + elif use_fp8_w8a16: + return "fp8_w8a16" elif use_int8_w8a16: return "int8_w8a16" elif use_int4_w4a16: @@ -319,6 +323,10 @@ class FusedMoEQuantConfig: def use_int8_w8a16(self) -> bool: return self._a1.dtype is None and self._w1.dtype == torch.int8 + @property + def use_fp8_w8a16(self) -> bool: + return self._a1.dtype is None and self._w1.dtype == current_platform.fp8_dtype() + @property def use_int4_w4a16(self) -> bool: return self._a1.dtype is None and self._w1.dtype == "int4" @@ -362,6 +370,7 @@ class FusedMoEQuantConfig: """ return _get_config_dtype_str( use_fp8_w8a8=self.use_fp8_w8a8, + use_fp8_w8a16=self.use_fp8_w8a16, use_int8_w8a16=self.use_int8_w8a16, use_int4_w4a16=self.use_int4_w4a16, ocp_mx_scheme=self.ocp_mx_scheme, @@ -680,7 +689,6 @@ def int4_w4a16_moe_quant_config( ) -> FusedMoEQuantConfig: """ Construct a quant config for 16-bit float activations and int4 weights. - Note: Activations are pre-quantized. """ group_shape = GroupShape(*block_shape) if block_shape is not None else None return FusedMoEQuantConfig( @@ -691,6 +699,27 @@ def int4_w4a16_moe_quant_config( ) +def fp8_w8a16_moe_quant_config( + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + block_shape: list[int] | None = None, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for 16-bit float activations and fp8 weights. + """ + group_shape = GroupShape(*block_shape) if block_shape is not None else None + return FusedMoEQuantConfig( + _a1=FusedMoEQuantDesc(), + _a2=FusedMoEQuantDesc(), + _w1=FusedMoEQuantDesc( + current_platform.fp8_dtype(), group_shape, w1_scale, None, None + ), + _w2=FusedMoEQuantDesc( + current_platform.fp8_dtype(), group_shape, w2_scale, None, None + ), + ) + + def int8_w8a16_moe_quant_config( w1_scale: torch.Tensor, w2_scale: torch.Tensor, @@ -700,7 +729,6 @@ def int8_w8a16_moe_quant_config( ) -> FusedMoEQuantConfig: """ Construct a quant config for 16-bit float activations and int8 weights. - Note: Activations are pre-quantized. """ group_shape = GroupShape(*block_shape) if block_shape is not None else None return FusedMoEQuantConfig( diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 92d72b75656cd..295a2a28156ed 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -13,9 +13,6 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( batched_moe_align_block_size, moe_align_block_size, ) -from vllm.model_executor.layers.fused_moe.prepare_finalize import ( - MoEPrepareAndFinalizeNoEP, -) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate, TopKWeightAndReduceNoOP, @@ -26,6 +23,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import ( marlin_moe_intermediate_size, marlin_quant_input, ) +from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types @@ -542,9 +540,11 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute): is_k_full: bool = True, ): # TODO (varun) : Enable activation quantization - assert quant_config.use_mxfp4_w4a16 or quant_config.use_int4_w4a16, ( - "Supports only mxfp4_w4a16 or int4_w4a16" - ) + assert ( + quant_config.use_mxfp4_w4a16 + or quant_config.use_int4_w4a16 + or quant_config.use_fp8_w8a16 + ), "Supports only mxfp4_w4a16, int4_w4a16 or fp8_w8a16" self.w13_g_idx = w13_g_idx self.w2_g_idx = w2_g_idx self.w13_g_idx_sort_indices = w13_g_idx_sort_indices @@ -555,11 +555,17 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute): @property def quant_type_id(self) -> int: # uint4b8 will be set for int4 weight and float4_e2m1f will be used for mxfp4 - return ( - scalar_types.uint4b8.id - if self.quant_config.use_int4_w4a16 - else scalar_types.float4_e2m1f.id - ) + if self.quant_config.use_int4_w4a16: + return scalar_types.uint4b8.id + elif self.quant_config.use_mxfp4_w4a16: + return scalar_types.float4_e2m1f.id + elif ( + self.quant_config.use_fp8_w8a16 + and current_platform.fp8_dtype() == torch.float8_e4m3fn + ): + return scalar_types.float8_e4m3fn.id + else: + raise NotImplementedError("Unsupported quantization type.") def moe_problem_size( self, @@ -711,16 +717,6 @@ class MarlinExperts(MarlinExpertsBase): ops.moe_sum(input, output) -def modular_marlin_fused_moe( - quant_config: FusedMoEQuantConfig, shared_experts: torch.nn.Module | None = None -) -> mk.FusedMoEModularKernel: - return mk.FusedMoEModularKernel( - MoEPrepareAndFinalizeNoEP(), - MarlinExperts(quant_config), - shared_experts, - ) - - class BatchedMarlinExperts(MarlinExpertsBase): def __init__( self, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 4b2438133dd6b..1f770d6d89a13 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -32,8 +32,8 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, RoutingMethodType, fp8_w8a8_moe_quant_config, + fp8_w8a16_moe_quant_config, ) -from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod from vllm.model_executor.layers.linear import ( LinearBase, @@ -95,7 +95,6 @@ from vllm.model_executor.parameter import ( ) from vllm.model_executor.utils import replace_parameter, set_weight_attrs from vllm.platforms import current_platform -from vllm.scalar_type import scalar_types from vllm.utils.deep_gemm import ( is_deep_gemm_e8m0_used, is_deep_gemm_supported, @@ -729,7 +728,6 @@ class Fp8MoEMethod(FusedMoEMethodBase): ) self.marlin_input_dtype = None - self.use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN self.flashinfer_moe_backend: FlashinferMoeBackend | None = None if self.fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM: self.flashinfer_moe_backend = FlashinferMoeBackend.TENSORRT_LLM @@ -1048,7 +1046,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): rotate_flashinfer_fp8_moe_weights(w13_weight, w2_weight) layer.w13_weight.data = w13_weight.data - if self.use_marlin: + if self.fp8_backend == Fp8MoeBackend.MARLIN: prepare_moe_fp8_layer_for_marlin( layer, False, input_dtype=self.marlin_input_dtype ) @@ -1091,10 +1089,17 @@ class Fp8MoEMethod(FusedMoEMethodBase): ) self.use_inplace = False - elif self.fp8_backend in [Fp8MoeBackend.DEEPGEMM, Fp8MoeBackend.TRITON]: + elif self.fp8_backend in [ + Fp8MoeBackend.DEEPGEMM, + Fp8MoeBackend.TRITON, + Fp8MoeBackend.MARLIN, + ]: from vllm.model_executor.layers.fused_moe import ( TritonOrDeepGemmExperts, ) + from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( + MarlinExperts, + ) from vllm.model_executor.layers.fused_moe.prepare_finalize import ( MoEPrepareAndFinalizeNoEP, ) @@ -1102,12 +1107,19 @@ class Fp8MoEMethod(FusedMoEMethodBase): config = self.get_fused_moe_quant_config(layer) assert config is not None self.moe_quant_config = config - self.kernel = mk.FusedMoEModularKernel( - MoEPrepareAndFinalizeNoEP(), - TritonOrDeepGemmExperts( + use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN + allow_deep_gemm = self.fp8_backend == Fp8MoeBackend.DEEPGEMM + moe_kernel = ( + MarlinExperts(quant_config=self.moe_quant_config) + if use_marlin + else TritonOrDeepGemmExperts( quant_config=self.moe_quant_config, - allow_deep_gemm=(self.fp8_backend == Fp8MoeBackend.DEEPGEMM), - ), + allow_deep_gemm=allow_deep_gemm, + ) + ) + + self.kernel = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), moe_kernel ) self.use_inplace = True @@ -1116,9 +1128,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, ) -> mk.FusedMoEPrepareAndFinalize | None: if ( - current_platform.is_xpu() - or self.rocm_aiter_moe_enabled - or self.use_marlin + self.rocm_aiter_moe_enabled + or self.fp8_backend == Fp8MoeBackend.MARLIN or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM ): return None @@ -1150,7 +1161,9 @@ class Fp8MoEMethod(FusedMoEMethodBase): TritonOrDeepGemmExperts, ) - assert not self.use_marlin and not self.rocm_aiter_moe_enabled, ( + assert ( + self.fp8_backend != Fp8MoeBackend.MARLIN + ) and not self.rocm_aiter_moe_enabled, ( "Marlin and ROCm AITER are not supported with all2all yet." ) @@ -1207,8 +1220,12 @@ class Fp8MoEMethod(FusedMoEMethodBase): def get_fused_moe_quant_config( self, layer: torch.nn.Module ) -> FusedMoEQuantConfig | None: - if self.use_marlin: - return None + if self.fp8_backend == Fp8MoeBackend.MARLIN: + return fp8_w8a16_moe_quant_config( + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + block_shape=self.weight_block_size, + ) return fp8_w8a8_moe_quant_config( w1_scale=( @@ -1314,29 +1331,6 @@ class Fp8MoEMethod(FusedMoEMethodBase): expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) - elif self.use_marlin: - # TODO(rob): convert this to MK. - assert layer.activation == "silu", ( - f"{layer.activation} not supported for Marlin MoE." - ) - result = fused_marlin_moe( - x, - layer.w13_weight, - layer.w2_weight, - None, - None, - layer.w13_weight_scale, - layer.w2_weight_scale, - router_logits, - topk_weights, - topk_ids, - quant_type_id=scalar_types.float8_e4m3fn.id, - apply_router_weight_on_input=layer.apply_router_weight_on_input, - global_num_experts=layer.global_num_experts, - expert_map=layer.expert_map, - input_dtype=self.marlin_input_dtype, - workspace=layer.workspace, - ) else: result = self.kernel( x, @@ -1495,7 +1489,7 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod): replace_parameter(layer, "w2_weight", shuffled_w2) # Rushuffle weights for MARLIN if needed. - if self.use_marlin: + if self.fp8_backend == Fp8MoeBackend.MARLIN: prepare_moe_fp8_layer_for_marlin( layer, False, input_dtype=self.marlin_input_dtype ) From 06d490282f2bab6922137eb5230be9df5ebbe9c4 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sun, 21 Dec 2025 12:41:57 -0500 Subject: [PATCH 03/31] [NVFP4][Perf] Tune NVFP4 input quant kernel for small batch size (#30897) Signed-off-by: mgoin --- benchmarks/kernels/bench_nvfp4_quant.py | 177 ++++++++++++++++++ .../activation_nvfp4_quant_fusion_kernels.cu | 5 +- csrc/quantization/fp4/nvfp4_experts_quant.cu | 31 ++- csrc/quantization/fp4/nvfp4_quant_kernels.cu | 62 ++---- csrc/quantization/fp4/nvfp4_utils.cuh | 65 +++---- 5 files changed, 243 insertions(+), 97 deletions(-) create mode 100644 benchmarks/kernels/bench_nvfp4_quant.py diff --git a/benchmarks/kernels/bench_nvfp4_quant.py b/benchmarks/kernels/bench_nvfp4_quant.py new file mode 100644 index 0000000000000..7517376535925 --- /dev/null +++ b/benchmarks/kernels/bench_nvfp4_quant.py @@ -0,0 +1,177 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse +import copy +import itertools + +import torch +from weight_shapes import WEIGHT_SHAPES + +from vllm import _custom_ops as ops +from vllm.platforms import current_platform +from vllm.scalar_type import scalar_types +from vllm.triton_utils import triton +from vllm.utils.flashinfer import flashinfer_fp4_quantize + +if not current_platform.has_device_capability(100): + raise RuntimeError("NVFP4 requires compute capability of 10.0 (Blackwell)") + +FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() +FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max + +PROVIDER_CFGS = { + "vllm": dict(backend="vllm", enabled=True), + "flashinfer": dict(backend="flashinfer", enabled=True), +} + +_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]] + + +def compute_global_scale(tensor: torch.Tensor) -> torch.Tensor: + """Compute global scale for FP4 quantization.""" + amax = torch.abs(tensor).max().to(torch.float32) + return FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / amax + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096], + x_log=False, + line_arg="provider", + line_vals=_enabled, + line_names=_enabled, + ylabel="us (lower is better)", + plot_name="NVFP4 Input Quantization Latency (us)", + args={}, + ) +) +def benchmark(batch_size, provider, N, K): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + + # Create input tensor + a = torch.randn((M, K), device=device, dtype=dtype) + + # Compute global scale for activation + a_global_scale = compute_global_scale(a) + + quantiles = [0.5, 0.2, 0.8] + + cfg = PROVIDER_CFGS[provider] + + if cfg["backend"] == "vllm": + # vLLM's FP4 quantization + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: ops.scaled_fp4_quant(a, a_global_scale), + quantiles=quantiles, + ) + elif cfg["backend"] == "flashinfer": + # FlashInfer's FP4 quantization + # Use is_sf_swizzled_layout=True to match vLLM's output format + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: flashinfer_fp4_quantize( + a, a_global_scale, is_sf_swizzled_layout=True + ), + quantiles=quantiles, + ) + + # Convert ms to us for better readability at small batch sizes + to_us = lambda t_ms: t_ms * 1000 + return to_us(ms), to_us(max_ms), to_us(min_ms) + + +def prepare_shapes(args): + out = [] + for model, tp_size in itertools.product(args.models, args.tp_sizes): + for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_dim] //= tp_size + KN.append(model) + out.append(KN) + return out + + +def _test_accuracy_once(M: int, K: int, dtype: torch.dtype, device: str): + """Test accuracy between vLLM and FlashInfer FP4 quantization.""" + # Create input tensor + a = torch.randn((M, K), device=device, dtype=dtype) + + # Compute global scale + a_global_scale = compute_global_scale(a) + + # vLLM quantization + vllm_fp4, vllm_scale = ops.scaled_fp4_quant(a, a_global_scale) + + # FlashInfer quantization (with swizzled layout to match vLLM's output) + flashinfer_fp4, flashinfer_scale = flashinfer_fp4_quantize( + a, a_global_scale, is_sf_swizzled_layout=True + ) + flashinfer_scale = flashinfer_scale.view(torch.float8_e4m3fn) + + # Compare outputs + torch.testing.assert_close( + vllm_fp4, + flashinfer_fp4, + ) + print(f"M={M}, K={K}, dtype={dtype}: PASSED") + + +def test_accuracy(): + """Run accuracy tests across various shapes.""" + print("\n" + "=" * 60) + print("Running accuracy tests: vLLM vs FlashInfer") + print("=" * 60) + + device = "cuda" + dtype = torch.bfloat16 + + # Test various batch sizes and hidden dimensions + Ms = [1, 1024] + Ks = [4096] + + for M in Ms: + for K in Ks: + _test_accuracy_once(M, K, dtype, device) + + print("\nAll accuracy tests passed!") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Benchmark NVFP4 quantization: vLLM vs FlashInfer" + ) + parser.add_argument( + "--models", + nargs="+", + type=str, + default=["meta-llama/Llama-3.1-8B-Instruct"], + choices=list(WEIGHT_SHAPES.keys()), + ) + parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) + parser.add_argument( + "--save-path", + type=str, + default=None, + help="Path to save benchmark results", + ) + parser.add_argument( + "--accuracy", + action="store_true", + help="Run accuracy tests", + ) + args = parser.parse_args() + + if args.accuracy: + test_accuracy() + + for K, N, model in prepare_shapes(args): + print(f"\n{model}, N={N} K={K}") + benchmark.run( + print_data=True, + save_path=args.save_path, + N=N, + K=K, + ) + + print("\nBenchmark finished!") diff --git a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu index 7539f836ecf37..e0438556dfe5c 100644 --- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu +++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu @@ -74,6 +74,9 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched."); + // Precompute SF layout parameter (constant for entire kernel). + int32_t const numKTiles = (numCols + 63) / 64; + // Get the global scaling factor, which will be applied to the SF. // Note SFScale is the same as next GEMM's alpha, which is // (448.f / (Alpha_A / 6.f)). @@ -101,7 +104,7 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) auto sf_out = cvt_quant_to_fp4_get_sf_out_offset( - rowIdx, colIdx, numCols, SFout); + rowIdx, colIdx, numKTiles, SFout); out_pos = cvt_warp_fp16_to_fp4(out_silu_mul, SFScaleVal, sf_out); diff --git a/csrc/quantization/fp4/nvfp4_experts_quant.cu b/csrc/quantization/fp4/nvfp4_experts_quant.cu index 82c53c2375a31..20191a9bc6160 100644 --- a/csrc/quantization/fp4/nvfp4_experts_quant.cu +++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu @@ -25,6 +25,7 @@ #include #include "dispatch_utils.h" +#include "cuda_utils.h" #include "nvfp4_utils.cuh" #include "launch_bounds_utils.h" @@ -44,6 +45,9 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched."); + // Precompute SF layout parameter (constant for entire kernel). + int32_t const numKTiles = (numCols + 63) / 64; + int tid = blockIdx.x * blockDim.x + threadIdx.x; int colsPerRow = numCols / CVT_FP4_ELTS_PER_THREAD; @@ -112,17 +116,13 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) // (448.f / (Alpha_A / 6.f)). float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx]; - int factor = CVT_FP4_SF_VEC_SIZE * 4; - // The actual output_scales dim is computed from the padded numCols. - int32_t numCols_padded = (numCols + factor - 1) / factor * factor; - int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4; uint32_t* SFout_in_expert = - SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout; + SFout + output_scale_offset_by_experts[expert_idx] * numKTiles; auto sf_out = cvt_quant_to_fp4_get_sf_out_offset( - rowIdx_in_expert, colIdx, numCols, SFout_in_expert); + rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert); out_pos = cvt_warp_fp16_to_fp4(in_vec, SFScaleVal, sf_out); } @@ -140,6 +140,10 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD); static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched."); + + // Precompute SF layout parameter (constant for entire kernel). + int32_t const numKTiles = (numCols + 63) / 64; + extern __shared__ uint32_t shared_input_offsets[]; // Load input offsets into shared memory. @@ -202,16 +206,13 @@ __global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[expert_idx]; - int factor = CVT_FP4_SF_VEC_SIZE * 4; - int32_t numCols_padded = (numCols + factor - 1) / factor * factor; - int numCols_SFout = numCols_padded / CVT_FP4_SF_VEC_SIZE / 4; uint32_t* SFout_in_expert = - SFout + output_scale_offset_by_experts[expert_idx] * numCols_SFout; + SFout + output_scale_offset_by_experts[expert_idx] * numKTiles; auto sf_out = cvt_quant_to_fp4_get_sf_out_offset( - rowIdx_in_expert, colIdx, numCols, SFout_in_expert); + rowIdx_in_expert, colIdx, numKTiles, SFout_in_expert); out_pos = cvt_warp_fp16_to_fp4(in_vec, SFScaleVal, sf_out); } @@ -222,12 +223,8 @@ void quant_impl(void* output, void* output_scale, void* input, void* input_global_scale, void* input_offset_by_experts, void* output_scale_offset_by_experts, int m_topk, int k, int n_experts, cudaStream_t stream) { - // TODO: this multiProcessorCount should be cached. - int device; - cudaGetDevice(&device); - int multiProcessorCount; - cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, - device); + int multiProcessorCount = + get_device_attribute(cudaDevAttrMultiProcessorCount, -1); // Grid, Block size. // Each thread converts 8 values. diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu index 6d69852bb4e4f..6acadb4cefd2c 100644 --- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu +++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu @@ -38,6 +38,12 @@ __host__ __device__ inline Int round_up(Int x, Int y) { return (x + y - 1) / y * y; } +// Compute effective rows for grid configuration with swizzled SF layouts. +inline int computeEffectiveRows(int m) { + constexpr int ROW_TILE = 128; + return round_up(m, ROW_TILE); +} + // Use UE4M3 by default. template __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) @@ -49,6 +55,9 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD, "Vec size is not matched."); + // Precompute SF layout parameter (constant for entire kernel). + int32_t const numKTiles = (numCols + 63) / 64; + int sf_m = round_up(numRows, 128); int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE; int sf_n_int = round_up(sf_n_unpadded, 4) / 4; @@ -79,7 +88,7 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) auto sf_out = cvt_quant_to_fp4_get_sf_out_offset( - rowIdx, colIdx, numCols, SFout); + rowIdx, colIdx, numKTiles, SFout); out_pos = cvt_warp_fp16_to_fp4(in_vec, global_scale, sf_out); @@ -87,43 +96,6 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) } } -template -void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale, - int64_t* output, int32_t* SFOuput, bool useUE8M0, - int multiProcessorCount, cudaStream_t stream) { - // Grid, Block size. - // Each thread converts 8 values. - dim3 block(std::min(int(n / ELTS_PER_THREAD), 512)); - // Get number of blocks per SM - int const numBlocksPerSM = - vllm_runtime_blocks_per_sm(static_cast(block.x)); - dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM)); - - // Launch the cvt kernel. - if (useUE8M0) { - cvt_fp16_to_fp4<<>>( - m, n, input, SFScale, reinterpret_cast(output), - reinterpret_cast(SFOuput)); - } else { - cvt_fp16_to_fp4<<>>( - m, n, input, SFScale, reinterpret_cast(output), - reinterpret_cast(SFOuput)); - } -} - -// Instantiate the function. -template void invokeFP4Quantization(int m, int n, half const* input, - float const* SFScale, int64_t* output, - int32_t* SFOuput, bool useUE8M0, - int multiProcessorCount, - cudaStream_t stream); - -template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input, - float const* SFScale, int64_t* output, - int32_t* SFOuput, bool useUE8M0, - int multiProcessorCount, - cudaStream_t stream); - } // namespace vllm void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, @@ -147,13 +119,19 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output, const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); auto stream = at::cuda::getCurrentCUDAStream(input.get_device()); - // We don't support e8m0 scales at this moment. - bool useUE8M0 = false; + // Grid, Block size. Each thread converts 8 values. + dim3 block(std::min(int(n / ELTS_PER_THREAD), 512)); + int const numBlocksPerSM = + vllm_runtime_blocks_per_sm(static_cast(block.x)); + int effectiveRows = vllm::computeEffectiveRows(m); + dim3 grid(std::min(effectiveRows, multiProcessorCount * numBlocksPerSM)); VLLM_DISPATCH_HALF_TYPES(input.scalar_type(), "nvfp4_quant_kernel", [&] { using cuda_type = vllm::CUDATypeConverter::Type; auto input_ptr = static_cast(input.data_ptr()); - vllm::invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, - sf_out, useUE8M0, multiProcessorCount, stream); + // NOTE: We don't support e8m0 scales at this moment. + vllm::cvt_fp16_to_fp4<<>>( + m, n, input_ptr, input_sf_ptr, reinterpret_cast(output_ptr), + reinterpret_cast(sf_out)); }); } diff --git a/csrc/quantization/fp4/nvfp4_utils.cuh b/csrc/quantization/fp4/nvfp4_utils.cuh index 48e4959de9793..4c91af85e1514 100644 --- a/csrc/quantization/fp4/nvfp4_utils.cuh +++ b/csrc/quantization/fp4/nvfp4_utils.cuh @@ -128,51 +128,42 @@ inline __device__ float reciprocal_approximate_ftz(float a) { return b; } +// Compute SF output offset for swizzled tensor core layout. +// SF layout: [numMTiles, numKTiles, 32, 4, 4] +// Caller must precompute: numKTiles = (numCols + 63) / 64 template -__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx, - int numCols, - SFType* SFout) { +__device__ __forceinline__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset( + int rowIdx, int colIdx, int32_t numKTiles, SFType* SFout) { static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 || CVT_FP4_NUM_THREADS_PER_SF == 2); // One pair of threads write one SF to global memory. // TODO: stage through smem for packed STG.32 // is it better than STG.8 from 4 threads ? - if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) { - // SF vector index (16 elements share one SF in the K dimension). - int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF; - int32_t mIdx = rowIdx; - - // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)] - // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx] - - int32_t mTileIdx = mIdx / (32 * 4); - // SF vector size 16. - int factor = CVT_FP4_SF_VEC_SIZE * 4; - int32_t numKTiles = (numCols + factor - 1) / factor; - int64_t mTileStride = numKTiles * 32 * 4 * 4; - - int32_t kTileIdx = (kIdx / 4); - int64_t kTileStride = 32 * 4 * 4; - - // M tile layout [32, 4] is column-major. - int32_t outerMIdx = (mIdx % 32); - int64_t outerMStride = 4 * 4; - - int32_t innerMIdx = (mIdx % (32 * 4)) / 32; - int64_t innerMStride = 4; - - int32_t innerKIdx = (kIdx % 4); - int64_t innerKStride = 1; - - // Compute the global offset. - int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride + - outerMIdx * outerMStride + innerMIdx * innerMStride + - innerKIdx * innerKStride; - - return reinterpret_cast(SFout) + SFOffset; + if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF != 0) { + return nullptr; } - return nullptr; + + // SF vector index (16 elements share one SF in the K dimension). + int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF; + int32_t mIdx = rowIdx; + + // Decompose indices using bitwise ops (all divisors are powers of 2). + // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)] + int32_t mTileIdx = mIdx >> 7; // mIdx / 128 + int32_t outerMIdx = mIdx & 31; // mIdx % 32 + int32_t innerMIdx = (mIdx >> 5) & 3; // (mIdx / 32) % 4 + int32_t kTileIdx = kIdx >> 2; // kIdx / 4 + int32_t innerKIdx = kIdx & 3; // kIdx % 4 + + // Compute global SF offset: mTileIdx * (numKTiles * 512) + kTileIdx * 512 + + // outerMIdx * 16 + innerMIdx * 4 + innerKIdx + // Use bitwise OR for non-overlapping lower bits. + int64_t SFOffset = (static_cast(mTileIdx) * numKTiles + kTileIdx) + << 9 | + (outerMIdx << 4) | (innerMIdx << 2) | innerKIdx; + + return reinterpret_cast(SFout) + SFOffset; } // Quantizes the provided PackedVec into the uint32_t output From 9d701e90d82f091395dc32634f2a26132c167bfd Mon Sep 17 00:00:00 2001 From: Steve Westerhouse Date: Sun, 21 Dec 2025 18:41:37 -0600 Subject: [PATCH 04/31] [Doc] Clarify FP8 KV cache computation workflow (#31071) Signed-off-by: westers --- docs/design/paged_attention.md | 42 +++++++++---------- .../quantization/quantized_kvcache.md | 10 +++++ 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/docs/design/paged_attention.md b/docs/design/paged_attention.md index 5cc5878425515..53368ab1a79fa 100644 --- a/docs/design/paged_attention.md +++ b/docs/design/paged_attention.md @@ -139,18 +139,18 @@ token data. const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; ``` -
- ![](../assets/design/paged_attention/query.png){ align="center" alt="query" width="70%" } -
+

+ query +

Each thread defines its own `q_ptr` which points to the assigned query token data on global memory. For example, if `VEC_SIZE` is 4 and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains total of 128 elements divided into 128 / 4 = 32 vecs. -
- ![](../assets/design/paged_attention/q_vecs.png){ align="center" alt="q_vecs" width="70%" } -
+

+ q_vecs +

```cpp __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; @@ -187,9 +187,9 @@ key token at different iterations. As shown above, that `k_ptr` points to key token data based on `k_cache` at assigned block, assigned head and assigned token. -
- ![](../assets/design/paged_attention/key.png){ align="center" alt="key" width="70%" } -
+

+ key +

The diagram above illustrates the memory layout for key data. It assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is @@ -202,9 +202,9 @@ iterations. Inside each rectangle, there are a total 32 vecs (128 elements for one token) that will be processed by 2 threads (one thread group) separately. -
- ![](../assets/design/paged_attention/k_vecs.png){ align="center" alt="k_vecs" width="70%" } -
+

+ k_vecs +

```cpp K_vec k_vecs[NUM_VECS_PER_THREAD] @@ -361,17 +361,17 @@ later steps. Now, it should store the normalized softmax result of ## Value -
- ![](../assets/design/paged_attention/value.png){ align="center" alt="value" width="70%" } -
+

+ value +

-
- ![](../assets/design/paged_attention/logits_vec.png){ align="center" alt="logits_vec" width="50%" } -
+

+ logits_vec +

-
- ![](../assets/design/paged_attention/v_vec.png){ align="center" alt="v_vec" width="70%" } -
+

+ v_vec +

Now we need to retrieve the value data and perform dot multiplication with `logits`. Unlike query and key, there is no thread group diff --git a/docs/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md index d26a5e217f314..586117272d3ba 100644 --- a/docs/features/quantization/quantized_kvcache.md +++ b/docs/features/quantization/quantized_kvcache.md @@ -17,6 +17,16 @@ The E4M3 format offers higher precision compared to E5M2. However, due to its sm For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling factors of a finer granularity (e.g. per-channel). +### How FP8 KV Cache Works + +The FP8 KV cache implementation follows this workflow: + +1. **Storage**: Key and Value tensors are quantized to FP8 format using scaling factors before being stored in the KV cache +2. **Retrieval**: When needed for attention computation, cached KV tensors are dequantized back to higher precision (FP16/BF16) +3. **Attention**: The attention-value multiplication (softmax output × V) is performed using the dequantized higher-precision V tensor + +This means the final attention computation operates on dequantized values, not FP8 tensors. The quantization reduces memory usage during storage but maintains computation accuracy by using higher precision during the actual attention operations. + ### Performance Impact The current FP8 KV cache implementation primarily benefits throughput by allowing approximately double the amount of space for KV cache allocation. This enables either: From 7e065eba596e1fd273c765358f5cb34468bffb17 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Sun, 21 Dec 2025 21:32:40 -0500 Subject: [PATCH 05/31] [CI] Fix "2 Node Tests (4 GPUs in total)" (#31090) Signed-off-by: Lucas Wilkinson --- .buildkite/test-amd.yaml | 4 +- .buildkite/test-pipeline.yaml | 4 +- .buildkite/test_areas/distributed.yaml | 2 +- examples/offline_inference/data_parallel.py | 68 ++++++++++++++------- 4 files changed, 51 insertions(+), 27 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index bd00c47df8cb0..0d7194f003840 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1254,13 +1254,13 @@ steps: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - label: Distributed Tests (2 GPUs) # 68min timeout_in_minutes: 90 diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 1186acad48ae0..7b664c4fa15fe 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1109,13 +1109,13 @@ steps: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' - - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code + - python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code - label: Distributed Tests (2 GPUs) # 68min timeout_in_minutes: 90 diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 1a3739cc2417a..65a981a9d6d00 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -171,7 +171,7 @@ steps: - tests/distributed/ - tests/examples/offline_inference/data_parallel.py commands: - - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code" + - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code" - label: Distributed NixlConnector PD accuracy (4 GPUs) timeout_in_minutes: 30 diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index bcf1ba307eff1..287409fa2b5c1 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -14,19 +14,19 @@ Multi-node: --model="ibm-research/PowerMoE-3b" \ -dp=2 \ -tp=2 \ - --nnodes=2 \ - --node-rank=0 \ - --master-addr=10.99.48.128 \ - --master-port=13345 + --dp-num-nodes=2 \ + --dp-node-rank=0 \ + --dp-master-addr=10.99.48.128 \ + --dp-master-port=13345 Node 1: python examples/offline_inference/data_parallel.py \ --model="ibm-research/PowerMoE-3b" \ -dp=2 \ -tp=2 \ - --nnodes=2 \ - --node-rank=1 \ - --master-addr=10.99.48.128 \ - --master-port=13345 + --dp-num-nodes=2 \ + --dp-node-rank=1 \ + --dp-master-addr=10.99.48.128 \ + --dp-master-port=13345 """ import os @@ -48,7 +48,31 @@ def create_parser(): enable_expert_parallel=True, ) - # Add timeout (not in EngineArgs) + # Add DP-specific args (separate from engine args to avoid conflicts) + parser.add_argument( + "--dp-num-nodes", + type=int, + default=1, + help="Total number of nodes for data parallel.", + ) + parser.add_argument( + "--dp-node-rank", + type=int, + default=0, + help="Rank of the current node for data parallel.", + ) + parser.add_argument( + "--dp-master-addr", + type=str, + default="", + help="Master node IP address for DP coordination.", + ) + parser.add_argument( + "--dp-master-port", + type=int, + default=0, + help="Master node port for DP coordination.", + ) parser.add_argument( "--timeout", type=int, @@ -132,26 +156,26 @@ if __name__ == "__main__": parser = create_parser() args = vars(parser.parse_args()) - # Extract DP-specific args + # Extract DP-specific args (pop to remove from engine_args) dp_size = args.pop("data_parallel_size") - nnodes = args.get("nnodes", 1) - node_rank = args.get("node_rank", 0) - master_addr = args.get("master_addr", "") - master_port = args.get("master_port", 0) + dp_num_nodes = args.pop("dp_num_nodes") + dp_node_rank = args.pop("dp_node_rank") + dp_master_addr = args.pop("dp_master_addr") + dp_master_port = args.pop("dp_master_port") timeout = args.pop("timeout") # Remaining args are engine args engine_args = args - if nnodes == 1: + if dp_num_nodes == 1: dp_master_ip = "127.0.0.1" - dp_master_port = get_open_port() + dp_master_port_val = get_open_port() else: - dp_master_ip = master_addr - dp_master_port = master_port + dp_master_ip = dp_master_addr + dp_master_port_val = dp_master_port - assert dp_size % nnodes == 0, "dp_size should be divisible by nnodes" - dp_per_node = dp_size // nnodes + assert dp_size % dp_num_nodes == 0, "dp_size should be divisible by dp_num_nodes" + dp_per_node = dp_size // dp_num_nodes from multiprocessing import Process @@ -162,7 +186,7 @@ if __name__ == "__main__": procs = [] for local_dp_rank, global_dp_rank in enumerate( - range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node) + range(dp_node_rank * dp_per_node, (dp_node_rank + 1) * dp_per_node) ): proc = Process( target=main, @@ -171,7 +195,7 @@ if __name__ == "__main__": local_dp_rank, global_dp_rank, dp_master_ip, - dp_master_port, + dp_master_port_val, engine_args, ), ) From 097978a15dc3757e6fdbbae6ad752b97691581bd Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 22 Dec 2025 10:39:22 +0800 Subject: [PATCH 06/31] [Kernel] Enable fused_qknorm_rope_kernel supports partial rope (#30821) Signed-off-by: Jee Jee Li --- csrc/fused_qknorm_rope_kernel.cu | 109 ++++++++++-------- tests/kernels/core/test_fused_qk_norm_rope.py | 7 +- 2 files changed, 64 insertions(+), 52 deletions(-) diff --git a/csrc/fused_qknorm_rope_kernel.cu b/csrc/fused_qknorm_rope_kernel.cu index baff8363162ef..5c23a90794594 100644 --- a/csrc/fused_qknorm_rope_kernel.cu +++ b/csrc/fused_qknorm_rope_kernel.cu @@ -107,7 +107,8 @@ __global__ void fusedQKNormRopeKernel( void const* k_weight_void, // RMSNorm weights for key void const* cos_sin_cache_void, // Pre-computed cos/sin cache int64_t const* position_ids, // Position IDs for RoPE - int const num_tokens // Number of tokens + int const num_tokens, // Number of tokens + int const rotary_dim // Dimension for RoPE ) { #if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 800) && !defined(USE_ROCM) if constexpr ((std::is_same_v) || @@ -227,56 +228,59 @@ __global__ void fusedQKNormRopeKernel( // Calculate cache pointer for this position - similar to // pos_encoding_kernels.cu - T_cache const* cache_ptr = cos_sin_cache + pos_id * head_dim; - int const embed_dim = head_dim / 2; + T_cache const* cache_ptr = cos_sin_cache + pos_id * rotary_dim; + int const embed_dim = rotary_dim / 2; T_cache const* cos_ptr = cache_ptr; T_cache const* sin_ptr = cache_ptr + embed_dim; - - if constexpr (interleave) { - // Perform interleaving. Use pre-computed cos/sin values. + int const rotary_lanes = rotary_dim / numElemsPerThread; // rotary range + if (laneId < rotary_lanes) { + if constexpr (interleave) { + // Perform interleaving. Use pre-computed cos/sin values. #pragma unroll - for (int i = 0; i < numElemsPerThread / 2; ++i) { - int const idx0 = 2 * i; - int const idx1 = 2 * i + 1; + for (int i = 0; i < numElemsPerThread / 2; ++i) { + int const idx0 = 2 * i; + int const idx1 = 2 * i + 1; + // Global dimension index in the head + int const dim_idx = laneId * numElemsPerThread + idx0; - float const val0 = elements[idx0]; - float const val1 = elements[idx1]; + float const val0 = elements[idx0]; + float const val1 = elements[idx1]; - int const dim_idx = laneId * numElemsPerThread + idx0; - int const half_dim = dim_idx / 2; - float const cos_val = - CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim)); - float const sin_val = - CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim)); + int const half_dim = dim_idx / 2; + float const cos_val = + CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim)); + float const sin_val = + CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim)); - elements[idx0] = val0 * cos_val - val1 * sin_val; - elements[idx1] = val0 * sin_val + val1 * cos_val; - } - } else { - // Before data exchange with in warp, we need to sync. - __syncwarp(); - // Get the data from the other half of the warp. Use pre-computed cos/sin - // values. -#pragma unroll - for (int i = 0; i < numElemsPerThread; i++) { - elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], 16); - if (laneId < 16) { - elements2[i] = -elements2[i]; + elements[idx0] = val0 * cos_val - val1 * sin_val; + elements[idx1] = val0 * sin_val + val1 * cos_val; } + } else { + // Before data exchange with in warp, we need to sync. + __syncwarp(); + int pairOffset = (rotary_dim / 2) / numElemsPerThread; + // Get the data from the other half of the warp. Use pre-computed + // cos/sin values. +#pragma unroll + for (int i = 0; i < numElemsPerThread; i++) { + elements2[i] = __shfl_xor_sync(FINAL_MASK, elements[i], pairOffset); - int dim_idx = laneId * numElemsPerThread + i; - dim_idx = (dim_idx * 2) % head_dim; - int half_dim = dim_idx / 2; - // Use pre-computed cos/sin from cache - float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim)); - float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim)); + if (laneId < pairOffset) { + elements2[i] = -elements2[i]; + } + int dim_idx = laneId * numElemsPerThread + i; - elements[i] = elements[i] * cos_val + elements2[i] * sin_val; + dim_idx = (dim_idx * 2) % rotary_dim; + int half_dim = dim_idx / 2; + float cos_val = CacheConverter::convert(VLLM_LDG(cos_ptr + half_dim)); + float sin_val = CacheConverter::convert(VLLM_LDG(sin_ptr + half_dim)); + + elements[i] = elements[i] * cos_val + elements2[i] * sin_val; + } + // __shfl_xor_sync does not provide memfence. Need to sync again. + __syncwarp(); } - // __shfl_xor_sync does not provide memfence. Need to sync again. - __syncwarp(); } - // Store. { vec_T vec; @@ -312,10 +316,10 @@ template void launchFusedQKNormRope(void* qkv, int const num_tokens, int const num_heads_q, int const num_heads_k, int const num_heads_v, int const head_dim, - float const eps, void const* q_weight, - void const* k_weight, void const* cos_sin_cache, - bool const interleave, int64_t const* position_ids, - cudaStream_t stream) { + int const rotary_dim, float const eps, + void const* q_weight, void const* k_weight, + void const* cos_sin_cache, bool const interleave, + int64_t const* position_ids, cudaStream_t stream) { constexpr int blockSize = 256; int const warpsPerBlock = blockSize / 32; @@ -332,7 +336,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens, fusedQKNormRopeKernel <<>>( qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight, - k_weight, cos_sin_cache, position_ids, num_tokens); + k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim); }); break; case 128: @@ -340,7 +344,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens, fusedQKNormRopeKernel <<>>( qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight, - k_weight, cos_sin_cache, position_ids, num_tokens); + k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim); }); break; case 256: @@ -348,7 +352,7 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens, fusedQKNormRopeKernel <<>>( qkv, num_heads_q, num_heads_k, num_heads_v, eps, q_weight, - k_weight, cos_sin_cache, position_ids, num_tokens); + k_weight, cos_sin_cache, position_ids, num_tokens, rotary_dim); }); break; default: @@ -392,12 +396,16 @@ void fused_qk_norm_rope( "Query weights size must match head dimension"); TORCH_CHECK(k_weight.size(0) == head_dim, "Key weights size must match head dimension"); - TORCH_CHECK(cos_sin_cache.size(1) == head_dim, - "Cos/sin cache dimension must match head_dim"); + + TORCH_CHECK(cos_sin_cache.size(1) % 2 == 0, "rotary_dim must be even"); + TORCH_CHECK(cos_sin_cache.size(1) <= head_dim, + "rotary_dim must be less than or equal to head_dim"); + TORCH_CHECK(qkv.scalar_type() == q_weight.scalar_type() && qkv.scalar_type() == k_weight.scalar_type(), "qkv, q_weight and k_weight must have the same dtype"); + int64_t rotary_dim = cos_sin_cache.size(1); int64_t num_tokens = qkv.size(0); TORCH_CHECK(position_ids.size(0) == num_tokens, "Number of tokens in position_ids must match QKV"); @@ -419,7 +427,8 @@ void fused_qk_norm_rope( qkv.data_ptr(), static_cast(num_tokens), static_cast(num_heads_q), static_cast(num_heads_k), static_cast(num_heads_v), static_cast(head_dim), - static_cast(eps), q_weight.data_ptr(), k_weight.data_ptr(), + static_cast(cos_sin_cache.size(1)), static_cast(eps), + q_weight.data_ptr(), k_weight.data_ptr(), cos_sin_cache.data_ptr(), !is_neox, reinterpret_cast(position_ids.data_ptr()), stream); diff --git a/tests/kernels/core/test_fused_qk_norm_rope.py b/tests/kernels/core/test_fused_qk_norm_rope.py index a23959e353da9..05d61ec02fd29 100644 --- a/tests/kernels/core/test_fused_qk_norm_rope.py +++ b/tests/kernels/core/test_fused_qk_norm_rope.py @@ -13,6 +13,7 @@ DTYPES = [torch.bfloat16, torch.float16] IS_NEOX = [True, False] EPS_VALUES = [1e-5, 1e-6] SEEDS = [13] +PARTIAL_ROPE = [True, False] CUDA_DEVICES = ["cuda:0"] @@ -52,6 +53,7 @@ def _apply_qk_norm_rope( @pytest.mark.parametrize("is_neox", IS_NEOX) @pytest.mark.parametrize("eps", EPS_VALUES) @pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("rotary_ratio", [1.0, 0.5, 0.25]) @torch.inference_mode() def test_fused_qk_norm_rope_matches_reference( device: str, @@ -59,6 +61,7 @@ def test_fused_qk_norm_rope_matches_reference( is_neox: bool, eps: float, seed: int, + rotary_ratio: float, ): torch.set_default_device(device) current_platform.seed_everything(seed) @@ -76,10 +79,10 @@ def test_fused_qk_norm_rope_matches_reference( k_norm.weight.data.normal_(mean=1.0, std=0.1) q_weight = q_norm.weight.data k_weight = k_norm.weight.data - + rotary_dim = int(head_dim * rotary_ratio) rope = RotaryEmbedding( head_size=head_dim, - rotary_dim=head_dim, + rotary_dim=rotary_dim, max_position_embeddings=4096, base=10000.0, is_neox_style=is_neox, From 19cc9468fd0fa1701e7cb74b5928b329a1d16cf1 Mon Sep 17 00:00:00 2001 From: CedricHuang <38417461+CedricHwong@users.noreply.github.com> Date: Mon, 22 Dec 2025 11:34:49 +0800 Subject: [PATCH 07/31] [Feature]: Support NVIDIA ModelOpt HF FP8 variants FP8_PER_CHANNEL_PER_TOKEN and FP8_PB_WO in vLLM (#30957) --- docs/features/quantization/modelopt.md | 31 +++ tests/quantization/test_modelopt.py | 141 ++++++++++ vllm/config/model.py | 18 +- vllm/model_executor/layers/linear.py | 2 + .../layers/quantization/modelopt.py | 260 +++++++++++++++++- 5 files changed, 437 insertions(+), 15 deletions(-) diff --git a/docs/features/quantization/modelopt.md b/docs/features/quantization/modelopt.md index b02d5ba9e89a2..5c846767bc5b8 100644 --- a/docs/features/quantization/modelopt.md +++ b/docs/features/quantization/modelopt.md @@ -8,6 +8,16 @@ We recommend installing the library with: pip install nvidia-modelopt ``` +## Supported ModelOpt checkpoint formats + +vLLM detects ModelOpt checkpoints via `hf_quant_config.json` and supports the +following `quantization.quant_algo` values: + +- `FP8`: per-tensor weight scale (+ optional static activation scale). +- `FP8_PER_CHANNEL_PER_TOKEN`: per-channel weight scale and dynamic per-token activation quantization. +- `FP8_PB_WO` (ModelOpt may emit `fp8_pb_wo`): block-scaled FP8 weight-only (typically 128×128 blocks). +- `NVFP4`: ModelOpt NVFP4 checkpoints (use `quantization="modelopt_fp4"`). + ## Quantizing HuggingFace Models with PTQ You can quantize HuggingFace models using the example scripts provided in the Model Optimizer repository. The primary script for LLM PTQ is typically found within the `examples/llm_ptq` directory. @@ -80,3 +90,24 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll if __name__ == "__main__": main() ``` + +## Running the OpenAI-compatible server + +To serve a local ModelOpt checkpoint via the OpenAI-compatible API: + +```bash +vllm serve \ + --quantization modelopt \ + --host 0.0.0.0 --port 8000 +``` + +## Testing (local checkpoints) + +vLLM's ModelOpt unit tests are gated by local checkpoint paths and are skipped +by default in CI. To run the tests locally: + +```bash +export VLLM_TEST_MODELOPT_FP8_PC_PT_MODEL_PATH= +export VLLM_TEST_MODELOPT_FP8_PB_WO_MODEL_PATH= +pytest -q tests/quantization/test_modelopt.py +``` diff --git a/tests/quantization/test_modelopt.py b/tests/quantization/test_modelopt.py index 0298994c396f6..154b29d7017ac 100644 --- a/tests/quantization/test_modelopt.py +++ b/tests/quantization/test_modelopt.py @@ -6,6 +6,7 @@ Run `pytest tests/quantization/test_modelopt.py`. """ import os +from typing import NoReturn import pytest import torch @@ -19,6 +20,28 @@ def enable_pickle(monkeypatch): monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") +def _skip(msg: str) -> NoReturn: + pytest.skip(msg) + raise RuntimeError(msg) + + +def _snapshot_download_or_skip(model_id: str) -> str: + try: + from huggingface_hub import snapshot_download + except Exception as e: # pragma: no cover + _skip(f"huggingface_hub is required to download {model_id}: {e}") + + try: + return snapshot_download( + repo_id=model_id, + repo_type="model", + # These checkpoints are already small; download full repo for simplicity. + allow_patterns=["*"], + ) + except Exception as e: + _skip(f"Failed to download {model_id} from the HF Hub: {e}") + + @pytest.mark.skipif( not is_quant_method_supported("modelopt"), reason="ModelOpt FP8 is not supported on this GPU type.", @@ -91,3 +114,121 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner): output = llm.generate_greedy(["Hello my name is"], max_tokens=4) assert output print(f"ModelOpt FP8 output: {output}") + + +@pytest.mark.skipif( + not is_quant_method_supported("modelopt"), + reason="ModelOpt FP8 is not supported on this GPU type.", +) +def test_modelopt_fp8_pc_pt_checkpoint_setup(vllm_runner): + """Test ModelOpt FP8_PER_CHANNEL_PER_TOKEN checkpoint setup.""" + model_id = "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pc-pt" + model_path = _snapshot_download_or_skip(model_id) + + with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + o_proj = layer.self_attn.o_proj + gate_up_proj = layer.mlp.gate_up_proj + down_proj = layer.mlp.down_proj + + from vllm.model_executor.layers.quantization.modelopt import ( + ModelOptFp8PcPtLinearMethod, + ) + + assert isinstance(qkv_proj.quant_method, ModelOptFp8PcPtLinearMethod) + assert isinstance(o_proj.quant_method, ModelOptFp8PcPtLinearMethod) + assert isinstance(gate_up_proj.quant_method, ModelOptFp8PcPtLinearMethod) + assert isinstance(down_proj.quant_method, ModelOptFp8PcPtLinearMethod) + + assert qkv_proj.weight.dtype == torch.float8_e4m3fn + assert o_proj.weight.dtype == torch.float8_e4m3fn + assert gate_up_proj.weight.dtype == torch.float8_e4m3fn + assert down_proj.weight.dtype == torch.float8_e4m3fn + + # Per-channel scales; activations are dynamically scaled per token. + assert hasattr(qkv_proj, "weight_scale") + assert qkv_proj.weight_scale.dtype == torch.float32 + assert qkv_proj.weight_scale.dim() == 1 + assert not hasattr(qkv_proj, "input_scale") + + assert hasattr(o_proj, "weight_scale") + assert o_proj.weight_scale.dtype == torch.float32 + assert o_proj.weight_scale.dim() == 1 + assert not hasattr(o_proj, "input_scale") + + assert hasattr(gate_up_proj, "weight_scale") + assert gate_up_proj.weight_scale.dtype == torch.float32 + assert gate_up_proj.weight_scale.dim() == 1 + assert not hasattr(gate_up_proj, "input_scale") + + assert hasattr(down_proj, "weight_scale") + assert down_proj.weight_scale.dtype == torch.float32 + assert down_proj.weight_scale.dim() == 1 + assert not hasattr(down_proj, "input_scale") + + llm.apply_model(check_model) + + output = llm.generate_greedy(["Hello my name is"], max_tokens=4) + assert output + print(f"ModelOpt FP8_PER_CHANNEL_PER_TOKEN output: {output}") + + +@pytest.mark.skipif( + not is_quant_method_supported("modelopt"), + reason="ModelOpt FP8 is not supported on this GPU type.", +) +def test_modelopt_fp8_pb_wo_checkpoint_setup(vllm_runner): + """Test ModelOpt FP8_PB_WO checkpoint setup.""" + model_id = "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pb-wo" + model_path = _snapshot_download_or_skip(model_id) + + with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + o_proj = layer.self_attn.o_proj + gate_up_proj = layer.mlp.gate_up_proj + down_proj = layer.mlp.down_proj + + from vllm.model_executor.layers.quantization.modelopt import ( + ModelOptFp8PbWoLinearMethod, + ) + + assert isinstance(qkv_proj.quant_method, ModelOptFp8PbWoLinearMethod) + assert isinstance(o_proj.quant_method, ModelOptFp8PbWoLinearMethod) + assert isinstance(gate_up_proj.quant_method, ModelOptFp8PbWoLinearMethod) + assert isinstance(down_proj.quant_method, ModelOptFp8PbWoLinearMethod) + + assert qkv_proj.weight.dtype == torch.float8_e4m3fn + assert o_proj.weight.dtype == torch.float8_e4m3fn + assert gate_up_proj.weight.dtype == torch.float8_e4m3fn + assert down_proj.weight.dtype == torch.float8_e4m3fn + + # Block scales; should be materialized as a 2D [out_blk, in_blk] tensor. + assert hasattr(qkv_proj, "weight_scale") + assert qkv_proj.weight_scale.dtype == torch.float32 + assert qkv_proj.weight_scale.dim() == 2 + + assert hasattr(o_proj, "weight_scale") + assert o_proj.weight_scale.dtype == torch.float32 + assert o_proj.weight_scale.dim() == 2 + + assert hasattr(gate_up_proj, "weight_scale") + assert gate_up_proj.weight_scale.dtype == torch.float32 + assert gate_up_proj.weight_scale.dim() == 2 + + assert hasattr(down_proj, "weight_scale") + assert down_proj.weight_scale.dtype == torch.float32 + assert down_proj.weight_scale.dim() == 2 + + llm.apply_model(check_model) + + output = llm.generate_greedy(["Hello my name is"], max_tokens=4) + assert output + print(f"ModelOpt FP8_PB_WO output: {output}") diff --git a/vllm/config/model.py b/vllm/config/model.py index db5789b709372..c796e300ab155 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -843,12 +843,18 @@ class ModelConfig: producer_name = quant_cfg.get("producer", {}).get("name") if producer_name == "modelopt": quant_algo = quant_cfg.get("quantization", {}).get("quant_algo") - if quant_algo == "FP8": - quant_cfg["quant_method"] = "modelopt" - elif quant_algo == "NVFP4": - quant_cfg["quant_method"] = "modelopt_fp4" - elif quant_algo is not None: - raise ValueError(f"Unknown ModelOpt quant algo: {quant_algo}") + if quant_algo is not None: + quant_algo_upper = str(quant_algo).upper() + if quant_algo_upper in { + "FP8", + "FP8_PER_CHANNEL_PER_TOKEN", + "FP8_PB_WO", + }: + quant_cfg["quant_method"] = "modelopt" + elif quant_algo_upper == "NVFP4": + quant_cfg["quant_method"] = "modelopt_fp4" + else: + raise ValueError(f"Unknown ModelOpt quant algo: {quant_algo}") return quant_cfg diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 4ca4f75711ac7..402f0bf69ceaa 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -53,6 +53,8 @@ WEIGHT_LOADER_V2_SUPPORTED = [ "GPTQLinearMethod", "FBGEMMFp8LinearMethod", "ModelOptFp8LinearMethod", + "ModelOptFp8PcPtLinearMethod", + "ModelOptFp8PbWoLinearMethod", "IPEXAWQLinearMethod", "IPEXGPTQLinearMethod", "HQQMarlinMethod", diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 54e8673fcfbb8..afbefe1fedc18 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -55,6 +55,9 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( select_cutlass_fp8_gemm_impl, swap_w13_to_w31, ) +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + W8A8BlockFp8LinearOp, +) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( get_marlin_input_dtype, ) @@ -72,9 +75,15 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( ) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( Fp8LinearOp, + cutlass_block_fp8_supported, requantize_with_max_scale, ) -from vllm.model_executor.parameter import ModelWeightParameter, PerTensorScaleParameter +from vllm.model_executor.parameter import ( + BlockQuantScaleParameter, + ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter, +) from vllm.scalar_type import scalar_types from vllm.utils.flashinfer import ( flashinfer_scaled_fp4_mm, @@ -88,7 +97,16 @@ if TYPE_CHECKING: logger = init_logger(__name__) -QUANT_ALGOS = ["FP8", "NVFP4"] +QUANT_ALGOS = [ + # FP8 (per-tensor weight + optional static activation scale). + "FP8", + # FP8 per-channel weight scale + per-token activation scale. + "FP8_PER_CHANNEL_PER_TOKEN", + # FP8 per-block weight-only (ModelOpt may emit this as lowercase). + "FP8_PB_WO", + # FP4 + "NVFP4", +] KV_CACHE_QUANT_ALGOS = ["FP8"] @@ -255,6 +273,9 @@ class ModelOptQuantConfigBase(QuantizationConfig): if not quant_method: raise ValueError("Missing 'quant_algo' in quantization config") + # Normalize quant_algo for robust matching (ModelOpt may emit lowercase). + quant_method = str(quant_method).upper() + if kv_cache_quant_method is None: # No KV cache quantization, keep this branch just to have this comment pass @@ -263,6 +284,8 @@ class ModelOptQuantConfigBase(QuantizationConfig): f"kv_cache_quant_algo must be a string, got " f"{type(kv_cache_quant_method)}" ) + else: + kv_cache_quant_method = kv_cache_quant_method.upper() if not isinstance(exclude_modules, list): raise ValueError( @@ -302,17 +325,34 @@ class ModelOptFp8Config(ModelOptQuantConfigBase): def __init__( self, + quant_method: str, is_checkpoint_fp8_serialized: bool, kv_cache_quant_method: str | None, exclude_modules: list[str], ) -> None: super().__init__(exclude_modules) + self.quant_method = quant_method self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized self.kv_cache_quant_method = kv_cache_quant_method if is_checkpoint_fp8_serialized: logger.warning( - "Detected ModelOpt fp8 checkpoint. Please note that" - " the format is experimental and could change." + "Detected ModelOpt fp8 checkpoint (quant_algo=%s). Please note " + "that the format is experimental and could change.", + quant_method, + ) + + # Select LinearMethod implementation based on quant_algo. + if self.quant_method == "FP8": + self.LinearMethodCls = ModelOptFp8LinearMethod + elif self.quant_method == "FP8_PER_CHANNEL_PER_TOKEN": + self.LinearMethodCls = ModelOptFp8PcPtLinearMethod + elif self.quant_method == "FP8_PB_WO": + self.LinearMethodCls = ModelOptFp8PbWoLinearMethod + else: + raise ValueError( + "Unsupported ModelOpt FP8 quant_algo for vLLM: " + f"{self.quant_method}. Supported: FP8 / " + "FP8_PER_CHANNEL_PER_TOKEN / FP8_PB_WO." ) def get_name(self) -> QuantizationMethods: @@ -346,13 +386,13 @@ class ModelOptFp8Config(ModelOptQuantConfigBase): if "quantization" in hf_quant_cfg: quant_config = hf_quant_cfg["quantization"] if isinstance(quant_config, dict): - quant_algo = quant_config.get("quant_algo", "") - if "FP8" in quant_algo: + quant_algo = str(quant_config.get("quant_algo", "")) + if "FP8" in quant_algo.upper(): return "modelopt" else: # Check for compressed-tensors style config with specific quant_algo - quant_algo = hf_quant_cfg.get("quant_algo", "") - if isinstance(quant_algo, str) and "FP8" in quant_algo: + quant_algo = str(hf_quant_cfg.get("quant_algo", "")) + if "FP8" in quant_algo.upper(): return "modelopt" return None @@ -369,7 +409,12 @@ class ModelOptFp8Config(ModelOptQuantConfigBase): ) -> "ModelOptFp8Config": is_checkpoint_fp8_serialized = "FP8" in quant_method - return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, exclude_modules) + return cls( + quant_method, + is_checkpoint_fp8_serialized, + kv_cache_quant_method, + exclude_modules, + ) class ModelOptFp8LinearMethod(LinearMethodBase): @@ -464,6 +509,203 @@ class ModelOptFp8LinearMethod(LinearMethodBase): ) +class ModelOptFp8PcPtLinearMethod(LinearMethodBase): + """Linear method for ModelOpt FP8_PER_CHANNEL_PER_TOKEN checkpoints. + + Expected checkpoint structure (per Linear): + - weight: fp8-e4m3fn, shape [out, in] + - weight_scale: fp32, shape [out] (per-output-channel) + - no input_scale (activations are dynamically quantized per-token) + """ + + def __init__(self, quant_config: ModelOptFp8Config) -> None: + self.quant_config = quant_config + self.fp8_linear = Fp8LinearOp( + act_quant_static=False, act_quant_group_shape=GroupShape.PER_TOKEN + ) + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + del input_size, output_size + + if not self.quant_config.is_checkpoint_fp8_serialized: + raise ValueError( + "FP8_PER_CHANNEL_PER_TOKEN currently only supports " + "FP8-serialized checkpoints." + ) + + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + + weight = ModelWeightParameter( + data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + + weight_scale = ChannelQuantScaleParameter( + data=torch.empty(output_size_per_partition, dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader, + ) + weight_scale[:] = torch.finfo(torch.float32).min + layer.register_parameter("weight_scale", weight_scale) + + def process_weights_after_loading(self, layer: Module) -> None: + layer.weight = Parameter(layer.weight.t(), requires_grad=False) + layer.weight_scale = Parameter(layer.weight_scale.data, requires_grad=False) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + return self.fp8_linear.apply( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=None, + bias=bias, + ) + + +class ModelOptFp8PbWoLinearMethod(LinearMethodBase): + """Linear method for ModelOpt FP8_PB_WO checkpoints. + + ModelOpt exports `weight_scale` as a 4D tensor: + [out_blk, 1, in_blk, 1] + where block size is typically 128 for both dims. + + vLLM executes it as FP8 GEMM with *dynamic per-token* activation quant. + """ + + _WEIGHT_BLOCK_SIZE: tuple[int, int] = (128, 128) + + def __init__(self, quant_config: ModelOptFp8Config) -> None: + self.quant_config = quant_config + block_n, block_k = self._WEIGHT_BLOCK_SIZE + self.weight_block_size = list(self._WEIGHT_BLOCK_SIZE) + self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp( + weight_group_shape=GroupShape(block_n, block_k), + act_quant_group_shape=GroupShape(1, block_k), + cutlass_block_fp8_supported=cutlass_block_fp8_supported(), + use_aiter_and_is_supported=False, + ) + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + del input_size, output_size + + if not self.quant_config.is_checkpoint_fp8_serialized: + raise ValueError( + "FP8_PB_WO currently only supports FP8-serialized checkpoints." + ) + + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + + # Expose block size so the v2 weight loaders can translate offsets from + # element-space -> block-space for BlockQuantScaleParameter. + layer.weight_block_size = self.weight_block_size + + weight = ModelWeightParameter( + data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + + block_n, block_k = self._WEIGHT_BLOCK_SIZE + if output_size_per_partition % block_n != 0: + raise ValueError( + "ModelOpt FP8_PB_WO requires out_features divisible by " + f"{block_n}, got {output_size_per_partition}." + ) + if input_size_per_partition % block_k != 0: + raise ValueError( + "ModelOpt FP8_PB_WO requires in_features divisible by " + f"{block_k}, got {input_size_per_partition}." + ) + + out_blks = output_size_per_partition // block_n + in_blks = input_size_per_partition // block_k + + # Match ModelOpt's exported shape so weight loading works without a + # custom loader: [out_blk, 1, in_blk, 1] + weight_scale = BlockQuantScaleParameter( + data=torch.empty((out_blks, 1, in_blks, 1), dtype=torch.float32), + input_dim=2, + output_dim=0, + weight_loader=weight_loader, + ) + weight_scale[:] = torch.finfo(torch.float32).min + layer.register_parameter("weight_scale", weight_scale) + + def process_weights_after_loading(self, layer: Module) -> None: + # Keep weight in [out, in] layout for W8A8BlockFp8LinearOp. + layer.weight = Parameter(layer.weight.data, requires_grad=False) + + scale = layer.weight_scale + if scale.dim() == 4: + # [out_blk, 1, in_blk, 1] -> [out_blk, in_blk] + scale = scale.squeeze(1).squeeze(-1) + elif scale.dim() != 2: + raise ValueError( + "Unexpected ModelOpt FP8_PB_WO weight_scale shape: " + f"{tuple(scale.shape)}." + ) + + layer.weight_scale = Parameter(scale.contiguous(), requires_grad=False) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + return self.w8a8_block_fp8_linear.apply( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=None, + bias=bias, + ) + + class ModelOptFp8MoEMethod(FusedMoEMethodBase): """MoE method for ModelOpt FP8. Supports loading FP8 checkpoints with static weight scale and From 8c084de59d45a511aebe4005f49d9a81abceff3e Mon Sep 17 00:00:00 2001 From: Kevin McKay Date: Sun, 21 Dec 2025 23:13:14 -0600 Subject: [PATCH 08/31] [Misc] Fix spelling typos in comments (#31114) Signed-off-by: c0de128 --- .buildkite/scripts/generate-nightly-index.py | 2 +- tests/models/multimodal/processing/test_tensor_schema.py | 2 +- vllm/reasoning/mistral_reasoning_parser.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py index d0965fbd56405..72f0bb2970289 100644 --- a/.buildkite/scripts/generate-nightly-index.py +++ b/.buildkite/scripts/generate-nightly-index.py @@ -372,7 +372,7 @@ if __name__ == "__main__": print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}") - # keep only "official" files for a non-nightly version (specifed by cli args) + # keep only "official" files for a non-nightly version (specified by cli args) PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$") if PY_VERSION_RE.match(version): # upload-wheels.sh ensures no "dev" is in args.version diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index cb875436857cf..46fd4249ea4f8 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -138,7 +138,7 @@ def create_batched_mm_kwargs( ) -# TODO(Isotr0py): Don't initalize model during test +# TODO(Isotr0py): Don't initialize model during test @contextmanager def initialize_dummy_model( model_cls: type[nn.Module], diff --git a/vllm/reasoning/mistral_reasoning_parser.py b/vllm/reasoning/mistral_reasoning_parser.py index de3d1296ec734..48a36b4c6634c 100644 --- a/vllm/reasoning/mistral_reasoning_parser.py +++ b/vllm/reasoning/mistral_reasoning_parser.py @@ -104,7 +104,7 @@ class MistralReasoningParser(BaseThinkingReasoningParser): # 3. Both BOT and EOT have been outputted. elif has_bot_token and has_eot_token: return input_ids[:bot_token_index] + input_ids[eot_token_index + 1 :] - # 4. Only EOT has been outputted => this should not have occured for a model + # 4. Only EOT has been outputted => this should not have occurred for a model # well prompted and trained. else: return input_ids[:eot_token_index] + input_ids[eot_token_index + 1 :] From ec58c10ce1554eddca6c5c0eaa93ebeb4ed024d1 Mon Sep 17 00:00:00 2001 From: Kevin McKay Date: Sun, 21 Dec 2025 23:13:48 -0600 Subject: [PATCH 09/31] [Misc] Fix quantization-related typos (#31116) Signed-off-by: c0de128 --- tests/kernels/moe/modular_kernel_tools/common.py | 10 +++++----- tests/quantization/test_fp8.py | 2 +- vllm/utils/deep_gemm.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py index 6078ce44cee9f..537dcae4e74b4 100644 --- a/tests/kernels/moe/modular_kernel_tools/common.py +++ b/tests/kernels/moe/modular_kernel_tools/common.py @@ -258,16 +258,16 @@ class Config: f"{self.fe_supported_types()}." ) - # Check block quanization support - is_block_quatized = self.quant_block_shape is not None - if is_block_quatized and self.quant_dtype is None: + # Check block quantization support + is_block_quantized = self.quant_block_shape is not None + if is_block_quantized and self.quant_dtype is None: return False, "No block quantization support." - if is_block_quatized and not self.is_block_quant_supported(): + if is_block_quantized and not self.is_block_quant_supported(): return False, "Mismatched block quantization support." # deep_gemm only works with block-quantized - if self.needs_deep_gemm() and not is_block_quatized: + if self.needs_deep_gemm() and not is_block_quantized: return False, "Needs DeepGEMM but not block quantized." # Check dependencies (turn into asserts?) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index c9ab24fd439c5..a4b6d35987e13 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -217,7 +217,7 @@ def test_scaled_fp8_quant(dtype) -> None: ref_y, inv_scale = ops.scaled_fp8_quant(x, None) ref_y = per_tensor_dequantize(ref_y, inv_scale, dtype) - # Reference dynamic quantizaton + # Reference dynamic quantization y = quantize_ref(x, inv_scale) torch.testing.assert_close(ref_y, per_tensor_dequantize(y, inv_scale, dtype)) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index bcda46421e827..56c9ca361eaef 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -389,7 +389,7 @@ def should_use_deepgemm_for_fp8_linear( # Verify DeepGEMM N/K dims requirements # NOTE: Also synchronized with test_w8a8_block_fp8_deep_gemm_matmul - # test inside kernels/quatization/test_block_fp8.py + # test inside kernels/quantization/test_block_fp8.py N_MULTIPLE = 64 K_MULTIPLE = 128 From 42b42824ae8223fa84e7545709572c1fd231436b Mon Sep 17 00:00:00 2001 From: Kevin McKay Date: Sun, 21 Dec 2025 23:14:02 -0600 Subject: [PATCH 10/31] [Misc] Fix grammar errors in comments and messages (#31115) Signed-off-by: c0de128 --- tests/quantization/test_compressed_tensors.py | 6 +++--- vllm/attention/ops/merge_attn_states.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 412b21328a325..535f028202275 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -83,7 +83,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): current_platform.is_rocm() and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL ): - pytest.skip(f"Skip model {model_path} as it is not support on ROCm.") + pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.") with vllm_runner(model_path, enforce_eager=True) as llm: @@ -161,7 +161,7 @@ def test_compressed_tensors_w8a8_logprobs( current_platform.is_rocm() and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL ): - pytest.skip(f"Skip model {model_path} as it is not support on ROCm.") + pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.") if use_aiter: if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL: @@ -231,7 +231,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token( current_platform.is_rocm() and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL ): - pytest.skip(f"Skip model {model_path} as it is not support on ROCm.") + pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.") if use_aiter: if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL: diff --git a/vllm/attention/ops/merge_attn_states.py b/vllm/attention/ops/merge_attn_states.py index 16106f3c93a6a..f347fb3fbba51 100644 --- a/vllm/attention/ops/merge_attn_states.py +++ b/vllm/attention/ops/merge_attn_states.py @@ -15,7 +15,7 @@ def merge_attn_states( output_lse: torch.Tensor | None = None, ) -> None: # NOTE(DefTruth): Currently, custom merge_attn_states CUDA kernel - # is not support for FP8 dtype, fallback to use Triton kernel. + # does not support FP8 dtype, fallback to use Triton kernel. def supported_dtypes(o: torch.Tensor) -> bool: return o.dtype in [torch.float32, torch.half, torch.bfloat16] From 14c3e6ade3605852282ec895397e6403b609847a Mon Sep 17 00:00:00 2001 From: Kevin McKay Date: Sun, 21 Dec 2025 23:14:14 -0600 Subject: [PATCH 11/31] [Misc] Fix spelling typos in model comments (#31117) Signed-off-by: c0de128 --- vllm/model_executor/models/config.py | 2 +- vllm/model_executor/models/qwen3_omni_moe_thinker.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index a3624b1cfa5f2..ccac8a6066429 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -401,7 +401,7 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): # of attention tokens that would fit mamba_page_size: # e.g. for mamba page size = 788kB # attn_1_token = 2kB -> fits ~394 tokens - # then round up to a mulitple of 256 -> 512 tokens + # then round up to a multiple of 256 -> 512 tokens # End result: # attn_block_size = 512 # mamba_block_size = 512 (aligned to a multiple of chunk_size) diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 089129e443c01..5ca6b3d852ac3 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -323,7 +323,7 @@ class Qwen3Omni_VisionTransformer(nn.Module): hidden_size=self.hidden_size, ) - # vit pos embeding, TODO: spatial_patch_size vs patch_size + # vit pos embedding, TODO: spatial_patch_size vs patch_size if self.apply_vit_abs_pos_embed: self.pos_embed = nn.Embedding(self.num_grid_per_side**2, self.hidden_size) else: From 44ae85f725591edbfa3aa9e42184e68ebe56f504 Mon Sep 17 00:00:00 2001 From: Kevin McKay Date: Sun, 21 Dec 2025 23:14:27 -0600 Subject: [PATCH 12/31] [Misc] Fix typo: 'occured' -> 'occurred' (#31120) Signed-off-by: c0de128 From cf8eed7bef52293a9a6706a2953374b24dff2c81 Mon Sep 17 00:00:00 2001 From: Kevin McKay Date: Sun, 21 Dec 2025 23:14:58 -0600 Subject: [PATCH 13/31] [Bugfix][ROCm] Fix typo: is_linear_fp8_enaled -> is_linear_fp8_enabled (#31109) Signed-off-by: c0de128 Co-authored-by: Claude Sonnet 4.5 --- vllm/_aiter_ops.py | 2 +- .../compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py | 2 +- vllm/model_executor/layers/quantization/fp8.py | 2 +- vllm/model_executor/layers/quantization/input_quant_fp8.py | 2 +- vllm/platforms/rocm.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index 0eae279acf5be..03e3bb7594910 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -761,7 +761,7 @@ class rocm_aiter_ops: @classmethod @if_aiter_supported - def is_linear_fp8_enaled(cls) -> bool: + def is_linear_fp8_enabled(cls) -> bool: return cls.is_linear_enabled() @classmethod diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index ee99572f5f499..758a54c10605a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -61,7 +61,7 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): ) self.cutlass_block_fp8_supported = cutlass_block_fp8_supported() - self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enaled() + self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enabled() if self.weight_block_size is not None: assert not self.is_static_input_scheme diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 1f770d6d89a13..d19b20798ed06 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -414,7 +414,7 @@ class Fp8LinearMethod(LinearMethodBase): if vllm_is_batch_invariant(): self.use_marlin = False - self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enaled() + self.use_aiter_and_is_supported = rocm_aiter_ops.is_linear_fp8_enabled() self.use_deep_gemm = is_deep_gemm_supported() self.weight_block_size = self.quant_config.weight_block_size diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py index a5db086fb4729..7994c838ad548 100644 --- a/vllm/model_executor/layers/quantization/input_quant_fp8.py +++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py @@ -51,7 +51,7 @@ class QuantFP8(CustomOp): self.column_major_scales = column_major_scales self.use_ue8m0 = use_ue8m0 - self.use_aiter = rocm_aiter_ops.is_linear_fp8_enaled() + self.use_aiter = rocm_aiter_ops.is_linear_fp8_enabled() self.is_group_quant = group_shape.is_per_group() if self.is_group_quant: diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 5892639eba406..b95287906c1fe 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -408,7 +408,7 @@ class RocmPlatform(Platform): parallel_config = vllm_config.parallel_config is_eager_execution = compilation_config == CUDAGraphMode.NONE use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled() - use_aiter_fp8_linear = rocm_aiter_ops.is_linear_fp8_enaled() + use_aiter_fp8_linear = rocm_aiter_ops.is_linear_fp8_enabled() if compilation_config.cudagraph_mode.has_full_cudagraphs(): # decode context parallel does not support full cudagraphs From c02a2705f9ceeb00b5d32453621f997b2ceafbea Mon Sep 17 00:00:00 2001 From: Roger Young <42564206+rogeryoungh@users.noreply.github.com> Date: Mon, 22 Dec 2025 13:28:40 +0800 Subject: [PATCH 14/31] Update MiniMax-M2 ToolCall and add MiniMax-M2.1 in Docs (#31083) Signed-off-by: xuebi Co-authored-by: xuebi --- docs/models/supported_models.md | 2 +- vllm/tool_parsers/minimax_m2_tool_parser.py | 213 +++++++++++++++----- 2 files changed, 167 insertions(+), 48 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index ffd0681dca7e8..90d4ff96c52f7 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -418,7 +418,7 @@ th { | `MiMoV2FlashForCausalLM` | MiMoV2Flash | `XiaomiMiMo/MiMo-V2-Flash`, etc. | ︎| ✅︎ | | `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | | `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | -| `MiniMaxM2ForCausalLM` | MiniMax-M2 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ | +| `MiniMaxM2ForCausalLM` | MiniMax-M2, MiniMax-M2.1 |`MiniMaxAI/MiniMax-M2`, etc. | | ✅︎ | | `MistralForCausalLM` | Ministral-3, Mistral, Mistral-Instruct | `mistralai/Ministral-3-3B-Instruct-2512`, `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | | `MistralLarge3ForCausalLM` | Mistral-Large-3-675B-Base-2512, Mistral-Large-3-675B-Instruct-2512 | `mistralai/Mistral-Large-3-675B-Base-2512`, `mistralai/Mistral-Large-3-675B-Instruct-2512`, etc. | ✅︎ | ✅︎ | | `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | diff --git a/vllm/tool_parsers/minimax_m2_tool_parser.py b/vllm/tool_parsers/minimax_m2_tool_parser.py index a1ab75f548bfc..67bd0e61620da 100644 --- a/vllm/tool_parsers/minimax_m2_tool_parser.py +++ b/vllm/tool_parsers/minimax_m2_tool_parser.py @@ -138,37 +138,167 @@ class MinimaxM2ToolParser(ToolParser): return name_str def _convert_param_value(self, value: str, param_type: str) -> Any: - """Convert parameter value to the correct type.""" + """Convert parameter value to the correct type (legacy single-type version).""" + return self._convert_param_value_with_types(value, [param_type]) + + def _extract_types_from_schema(self, schema: Any) -> list[str]: + """ + Extract all possible types from a JSON schema definition. + Handles anyOf, oneOf, allOf, type arrays, and enum fields. + + Args: + schema: The JSON schema definition for a parameter + + Returns: + List of type strings (e.g., ["string", "integer", "null"]) + """ + if schema is None: + return ["string"] + + if not isinstance(schema, dict): + return ["string"] + + types: set[str] = set() + + # Handle direct "type" field + if "type" in schema: + type_value = schema["type"] + if isinstance(type_value, str): + types.add(type_value) + elif isinstance(type_value, list): + for t in type_value: + if isinstance(t, str): + types.add(t) + + # Handle enum - infer types from enum values + if "enum" in schema and isinstance(schema["enum"], list) and schema["enum"]: + for value in schema["enum"]: + if value is None: + types.add("null") + elif isinstance(value, bool): + types.add("boolean") + elif isinstance(value, int): + types.add("integer") + elif isinstance(value, float): + types.add("number") + elif isinstance(value, str): + types.add("string") + elif isinstance(value, list): + types.add("array") + elif isinstance(value, dict): + types.add("object") + + # Handle anyOf, oneOf, allOf - recursively extract types + for choice_field in ("anyOf", "oneOf", "allOf"): + if choice_field in schema and isinstance(schema[choice_field], list): + for choice in schema[choice_field]: + extracted = self._extract_types_from_schema(choice) + types.update(extracted) + + # If no types found, default to string + if not types: + return ["string"] + + return list(types) + + def _convert_param_value_with_types( + self, value: str, param_types: list[str] + ) -> Any: + """ + Convert parameter value to the correct type based on a list of possible types. + Tries each type in order until one succeeds. + + Args: + value: The string value to convert + param_types: List of possible type strings + + Returns: + The converted value + """ if value.lower() == "null": return None - param_type = param_type.lower() - if param_type in ["string", "str", "text"]: + # Normalize types + normalized_types = [t.lower() for t in param_types] + + # Try null first if it's in the list + if "null" in normalized_types or value.lower() in ("null", "none", "nil"): + return None + + # Try each type in order of preference (most specific first, string as fallback) + # Priority: integer > number > boolean > object > array > string + type_priority = [ + "integer", + "int", + "number", + "float", + "boolean", + "bool", + "object", + "array", + "string", + "str", + "text", + ] + + for param_type in type_priority: + if param_type not in normalized_types: + continue + + if param_type in ["string", "str", "text"]: + return value + elif param_type in ["integer", "int"]: + try: + return int(value) + except (ValueError, TypeError): + continue + elif param_type in ["number", "float"]: + try: + val = float(value) + return val if val != int(val) else int(val) + except (ValueError, TypeError): + continue + elif param_type in ["boolean", "bool"]: + lower_val = value.lower().strip() + if lower_val in ["true", "1", "yes", "on"]: + return True + elif lower_val in ["false", "0", "no", "off"]: + return False + continue + elif param_type in ["object", "array"]: + try: + return json.loads(value) + except json.JSONDecodeError: + continue + + # Fallback: try JSON parse, then return as string + try: + return json.loads(value) + except json.JSONDecodeError: return value - elif param_type in ["integer", "int"]: - try: - return int(value) - except (ValueError, TypeError): - return value - elif param_type in ["number", "float"]: - try: - val = float(value) - return val if val != int(val) else int(val) - except (ValueError, TypeError): - return value - elif param_type in ["boolean", "bool"]: - return value.lower() in ["true", "1"] - elif param_type in ["object", "array"]: - try: - return json.loads(value) - except json.JSONDecodeError: - return value - else: - # Try JSON parse first, fallback to string - try: - return json.loads(value) - except json.JSONDecodeError: - return value + + def _get_param_types_from_config( + self, param_name: str, param_config: dict + ) -> list[str]: + """ + Get parameter types from parameter configuration. + Handles anyOf, oneOf, allOf, and direct type definitions. + + Args: + param_name: The name of the parameter + param_config: The properties dict from the tool schema + + Returns: + List of type strings + """ + if param_name not in param_config: + return ["string"] + + param_schema = param_config[param_name] + if not isinstance(param_schema, dict): + return ["string"] + + return self._extract_types_from_schema(param_schema) def _parse_single_invoke( self, invoke_str: str, tools: list | None @@ -207,17 +337,11 @@ class MinimaxM2ToolParser(ToolParser): if param_value.endswith("\n"): param_value = param_value[:-1] - # Get parameter type - param_type = "string" - if ( - param_name in param_config - and isinstance(param_config[param_name], dict) - and "type" in param_config[param_name] - ): - param_type = param_config[param_name]["type"] + # Get parameter types (supports anyOf/oneOf/allOf) + param_type = self._get_param_types_from_config(param_name, param_config) # Convert value - param_dict[param_name] = self._convert_param_value( + param_dict[param_name] = self._convert_param_value_with_types( param_value, param_type ) @@ -593,7 +717,7 @@ class MinimaxM2ToolParser(ToolParser): # Store raw value for later processing self.accumulated_params[self.current_param_name] = param_value - # Get parameter configuration for type conversion + # Get parameter configuration with anyOf support param_config = {} if self.streaming_request and self.streaming_request.tools: for tool in self.streaming_request.tools: @@ -610,17 +734,12 @@ class MinimaxM2ToolParser(ToolParser): param_config = params["properties"] break - # Get parameter type - param_type = "string" - if ( - self.current_param_name in param_config - and isinstance(param_config[self.current_param_name], dict) - and "type" in param_config[self.current_param_name] - ): - param_type = param_config[self.current_param_name]["type"] + # Get parameter types (supports anyOf/oneOf/allOf) + param_type = self._get_param_types_from_config( + self.current_param_name, param_config + ) - # Convert param value to appropriate type - converted_value = self._convert_param_value( + converted_value = self._convert_param_value_with_types( param_value, param_type ) From 256a33ecb4923a4bfa6e1a9adbc69f5255a5aa6d Mon Sep 17 00:00:00 2001 From: Li Wang Date: Mon, 22 Dec 2025 18:15:54 +0800 Subject: [PATCH 15/31] [Model] Fix bagel failed to run (#31132) Signed-off-by: wangli --- vllm/model_executor/models/bagel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/bagel.py b/vllm/model_executor/models/bagel.py index 98229c6d4ca1b..cf45fb9fe8370 100644 --- a/vllm/model_executor/models/bagel.py +++ b/vllm/model_executor/models/bagel.py @@ -487,7 +487,7 @@ class BagelForConditionalGeneration( # Split by image return tuple(vision_embeds) - def get_multimodal_embeddings(self, **kwargs: object) -> MultiModalEmbeddings: + def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: """Get multimodal embeddings from input.""" image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: From bd6d5a7475aa26a85de5d63cfcae9a1151d7652f Mon Sep 17 00:00:00 2001 From: AlonKejzman Date: Mon, 22 Dec 2025 14:56:06 +0200 Subject: [PATCH 16/31] [gpt-oss] Fix harmony parser in streaming responses (#30205) Signed-off-by: AlonKejzman --- vllm/entrypoints/openai/serving_chat.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 88d87a3334955..422a8c18e8e98 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -811,6 +811,11 @@ class OpenAIServingChat(OpenAIServing): delta_text += harmony_parser.last_content_delta or "" cur_channel = harmony_parser.current_channel cur_recipient = harmony_parser.current_recipient + # handle the case where several tokens where generated at once + # including the final token, leading to a delta in the text + # but the current channel to be empty (start state) + if not cur_channel and delta_text: + cur_channel = "final" else: delta_text = output.text From 2cf91c2ea48f27572b9fdcdd8d8802c7f1059e8b Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Mon, 22 Dec 2025 21:24:21 +0800 Subject: [PATCH 17/31] [CI] add polling for precompiled wheel in python_only_compile.sh, fix index generation for releases (#30781) Signed-off-by: Shengqi Chen --- .buildkite/scripts/generate-nightly-index.py | 12 ++++++-- .buildkite/scripts/upload-wheels.sh | 3 +- tests/standalone_tests/python_only_compile.sh | 28 +++++++++++++------ 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py index 72f0bb2970289..1794df9479e55 100644 --- a/.buildkite/scripts/generate-nightly-index.py +++ b/.buildkite/scripts/generate-nightly-index.py @@ -291,6 +291,7 @@ if __name__ == "__main__": """ Arguments: --version : version string for the current build (e.g., commit hash) + --wheel-dir : directory containing wheel files (default to be same as `version`) --current-objects : path to JSON file containing current S3 objects listing in this version directory --output-dir : directory to store generated index files --alias-to-default : (optional) alias variant name for the default variant @@ -318,6 +319,12 @@ if __name__ == "__main__": required=True, help="Directory to store generated index files", ) + parser.add_argument( + "--wheel-dir", + type=str, + default=None, + help="Directory containing wheel files (default to be same as `version`)", + ) parser.add_argument( "--alias-to-default", type=str, @@ -384,9 +391,10 @@ if __name__ == "__main__": print("Nightly version detected, keeping all wheel files.") # Generate index and metadata, assuming wheels and indices are stored as: - # s3://vllm-wheels/{version}/ + # s3://vllm-wheels/{wheel_dir}/ # s3://vllm-wheels// - wheel_base_dir = Path(output_dir).parent / version + wheel_dir = args.wheel_dir or version + wheel_base_dir = Path(output_dir).parent / wheel_dir.strip().rstrip("/") index_base_dir = Path(output_dir) generate_index_and_metadata( diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index 3a218a4bb2e6d..1af7f476ae74b 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -102,6 +102,7 @@ if [[ "$version" != *"dev"* ]]; then echo "Re-generating indices for /$pure_version/" rm -rf "$INDICES_OUTPUT_DIR/*" mkdir -p "$INDICES_OUTPUT_DIR" - $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg + # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path + $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --wheel-dir "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/" fi diff --git a/tests/standalone_tests/python_only_compile.sh b/tests/standalone_tests/python_only_compile.sh index 2017e34030d60..ebf199a5056fb 100644 --- a/tests/standalone_tests/python_only_compile.sh +++ b/tests/standalone_tests/python_only_compile.sh @@ -18,25 +18,37 @@ for i in {1..5}; do echo "Checking metadata.json URL (attempt $i)..." if curl --fail "$meta_json_url" > metadata.json; then echo "INFO: metadata.json URL is valid." - # check whether it is valid json by python + # check whether it is valid json by python (printed to stdout) if python3 -m json.tool metadata.json; then - echo "INFO: metadata.json is valid JSON. Proceeding with the test." + echo "INFO: metadata.json is valid JSON. Proceeding with the check." + # check whether there is an object in the json matching: + # "package_name": "vllm", and "platform_tag" matches the current architecture + # see `determine_wheel_url` in setup.py for more details + if python3 -c "import platform as p,json as j,sys as s; d = j.load(open('metadata.json')); \ + s.exit(int(not any(o.get('package_name') == 'vllm' and p.machine() in o.get('platform_tag') \ + for o in d)))" 2>/dev/null; then + echo "INFO: metadata.json contains a pre-compiled wheel for the current architecture." + break + else + echo "WARN: metadata.json does not have a pre-compiled wheel for the current architecture." + fi else echo "CRITICAL: metadata.json exists but is not valid JSON, please do report in #sig-ci channel!" + echo "INFO: metadata.json content:" + cat metadata.json exit 1 fi - break fi - # failure handling + # failure handling & retry logic if [ $i -eq 5 ]; then - echo "ERROR: metadata.json URL is still not valid after 5 attempts." - echo "ERROR: Please check whether the precompiled wheel for commit $merge_base_commit exists." + echo "ERROR: metadata is still not available after 5 attempts." + echo "ERROR: Please check whether the precompiled wheel for commit $merge_base_commit is available." echo " NOTE: If $merge_base_commit is a new commit on main, maybe try again after its release pipeline finishes." echo " NOTE: If it fails, please report in #sig-ci channel." exit 1 else - echo "WARNING: metadata.json URL is not valid. Retrying in 3 minutes..." - sleep 180 + echo "WARNING: metadata is not available. Retrying after 5 minutes..." + sleep 300 fi done From 8f8f469b1b8c4ef44a0a8a5f0bc6ddf86acbc9db Mon Sep 17 00:00:00 2001 From: dengyunyang <584797741@qq.com> Date: Mon, 22 Dec 2025 21:25:59 +0800 Subject: [PATCH 18/31] [BugFix] skip language model in Encoder (#30242) Signed-off-by: dengyunyang <584797741@qq.com> --- .../disaggregated_encoder/README.md | 2 + vllm/config/model.py | 2 +- vllm/model_executor/model_loader/utils.py | 9 ++- vllm/model_executor/models/adapters.py | 61 +++++++++++++++++++ vllm/model_executor/models/interfaces.py | 12 ++++ vllm/model_executor/models/qwen2_5_vl.py | 10 ++- vllm/model_executor/models/qwen3_vl.py | 8 +++ vllm/v1/worker/gpu_model_runner.py | 15 +++++ 8 files changed, 116 insertions(+), 3 deletions(-) diff --git a/examples/online_serving/disaggregated_encoder/README.md b/examples/online_serving/disaggregated_encoder/README.md index b2c3bb974dfab..2a59f86d15fb7 100644 --- a/examples/online_serving/disaggregated_encoder/README.md +++ b/examples/online_serving/disaggregated_encoder/README.md @@ -38,6 +38,8 @@ Encoder engines should be launched with the following flags: - `--max-num-batched-tokens=` **(default: 2048)** – This flag controls the token scheduling budget per decoding step and is irrelevant to encoder-only instances. **Set it to a very high value (effectively unlimited) to bypass scheduler limitations.** The actual token budget is managed by the encoder cache manager. +- `--convert "mm_encoder_only"` **(Optional)** - The language model is skipped during initialization to reduce device memory usage. **Models using this option must implement the `get_language_model_spec` interface.** + ## Local media inputs To support local image inputs (from your ```MEDIA_PATH``` directory), add the following flag to the encoder instance: diff --git a/vllm/config/model.py b/vllm/config/model.py index c796e300ab155..dd2b7b9d7a786 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -71,7 +71,7 @@ else: logger = init_logger(__name__) RunnerOption = Literal["auto", RunnerType] -ConvertType = Literal["none", "embed", "classify", "reward"] +ConvertType = Literal["none", "embed", "classify", "reward", "mm_encoder_only"] ConvertOption = Literal["auto", ConvertType] TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 74b02e4c62583..08d7a851ac9ab 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -189,7 +189,9 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], ) convert_type = model_config.convert_type - if convert_type != "none" and supports_multimodal(model_cls): + if convert_type not in ["none", "mm_encoder_only"] and supports_multimodal( + model_cls + ): logger.debug_once("Detected conversion of Multi Modal model.") converted = try_create_mm_pooling_model_cls(model_cls) if converted is not None: @@ -200,6 +202,11 @@ def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], if convert_type == "none": pass + elif convert_type == "mm_encoder_only": + logger.debug_once("Converting to mm encoder only model.") + from vllm.model_executor.models.adapters import as_mm_encoder_only_model + + model_cls = as_mm_encoder_only_model(model_cls) elif convert_type == "embed": logger.debug_once("Converting to embedding model.") model_cls = as_embedding_model(model_cls) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 504de9fe10871..acf1e57a59a97 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -520,3 +520,64 @@ def seq_cls_model_loader(model, weights: Iterable[tuple[str, torch.Tensor]]): method = getattr(text_config, "method", None) assert method in SEQ_CLS_LOAD_METHODS, f"method {method} not supported" return SEQ_CLS_LOAD_METHODS[method](model, weights) + + +def as_mm_encoder_only_model(cls: _T) -> _T: + """ + Subclass an existing vLLM vl model to support mm encoder only for + EPD encoder instances. + """ + if not hasattr(cls, "embed_multimodal"): + # Submodel case: return the original class. + return cls + + if not hasattr(cls, "get_language_model_spec"): + raise TypeError(f"{cls} need to implement `get_language_model_spec` method.") + + lm_model_cls, lm_attr = cls.get_language_model_spec() + + if lm_model_cls is None or lm_attr is None: + raise TypeError( + f"{cls}.get_language_model_spec() must return (lm_model_cls, lm_attr)" + ) + + class DummyLM(nn.Module): + def __init__(self, *args, **kwargs): + self.make_empty_intermediate_tensors = None + + class ModelForMMEncoderOnly(cls): + def __init__( + self, + *, + vllm_config: "VllmConfig", + prefix: str = "", + **kwargs: Any, + ) -> None: + self.is_mm_encoder_only_model = True + origin_init = lm_model_cls.__init__ + try: + lm_model_cls.__init__ = DummyLM.__init__ + super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs) + + if hasattr(self, lm_attr): + delattr(self, lm_attr) + finally: + lm_model_cls.__init__ = origin_init + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + from .utils import AutoWeightsLoader + + origin_init_ = AutoWeightsLoader.__init__ + + def _new_init_(self, *args, **kwargs): + origin_init_(self, *args, **kwargs) + self.skip_prefixes = (self.skip_prefixes or []) + [f"{lm_attr}."] + + try: + AutoWeightsLoader.__init__ = _new_init_ + result = super().load_weights(weights) + finally: + AutoWeightsLoader.__init__ = origin_init_ + return result + + return ModelForMMEncoderOnly # type: ignore diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index cb99d57e8b8c7..67c65a44dcf7f 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -141,6 +141,14 @@ class SupportsMultiModal(Protocol): """ ... + @classmethod + def get_language_model_spec(cls) -> tuple[nn.Module | None, str | None]: + """ + Return the language model spec: + (language model class, language model attr) + """ + return None, None + @overload def embed_input_ids(self, input_ids: Tensor) -> Tensor: ... @@ -302,6 +310,10 @@ def supports_multimodal_encoder_tp_data(model: type[object] | object) -> bool: return getattr(model, "supports_encoder_tp_data", False) +def supports_mm_encoder_only(model: type[object] | object) -> bool: + return getattr(model, "is_mm_encoder_only_model", False) + + @overload def supports_multimodal_pruning( model: type[object], diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index b730ac0315893..0b44ff622f05b 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -34,7 +34,7 @@ import einops import torch import torch.nn as nn import torch.nn.functional as F -from transformers import BatchFeature +from transformers import BatchFeature, Qwen2ForCausalLM from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( Qwen2_5_VLConfig, @@ -1567,3 +1567,11 @@ class Qwen2_5_VLForConditionalGeneration( connector="visual.merger.", tower_model="visual.", ) + + @classmethod + def get_language_model_spec(cls) -> tuple[nn.Module | None, str | None]: + """ + Return the language model spec: + (language model class, language model attr) + """ + return Qwen2ForCausalLM, "language_model" diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 4838f68e06f70..fea73557f1e82 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -2090,3 +2090,11 @@ class Qwen3VLForConditionalGeneration( connector="visual.merger", tower_model="visual.", ) + + @classmethod + def get_language_model_spec(cls) -> tuple[nn.Module | None, str | None]: + """ + Return the language model spec: + (language model class, language model attr) + """ + return Qwen3LLMForCausalLM, "language_model" diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 92822d829a881..0a17923e89989 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -66,6 +66,7 @@ from vllm.model_executor.models.interfaces import ( SupportsXDRoPE, is_mixture_of_experts, supports_eagle3, + supports_mm_encoder_only, supports_mrope, supports_multimodal_pruning, supports_transcription, @@ -4067,6 +4068,11 @@ class GPUModelRunner( remove_lora: If False, dummy LoRAs are not destroyed after the run activate_lora: If False, dummy_run is performed without LoRAs. """ + if supports_mm_encoder_only(self.model): + # The current dummy run only covers LM execution, so we can skip it. + # mm encoder dummy run may need to add in the future. + return torch.tensor([]), torch.tensor([]) + assert ( cudagraph_runtime_mode is None or cudagraph_runtime_mode.valid_runtime_modes() @@ -4344,6 +4350,11 @@ class GPUModelRunner( # The dummy hidden states may contain special values, # like `inf` or `nan`. # To avoid breaking the sampler, we use a random tensor here instead. + + if supports_mm_encoder_only(self.model): + # MM Encoder only model no need to run sampler. + return torch.tensor([]) + hidden_states = torch.rand_like(hidden_states) logits = self.model.compute_logits(hidden_states) @@ -4472,6 +4483,10 @@ class GPUModelRunner( self, hidden_states: torch.Tensor, ) -> PoolerOutput: + if supports_mm_encoder_only(self.model): + # MM Encoder only model not need to run pooler. + return torch.tensor([]) + # Find the task that has the largest output for subsequent steps supported_pooling_tasks = self.get_supported_pooling_tasks() From b1c3f96ae3874cc3b5a6bc7e55f43339b81da183 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Mon, 22 Dec 2025 16:21:40 +0100 Subject: [PATCH 19/31] [CI][Bugfix] Fix `entrypoints/openai/test_audio.py` (#31151) Signed-off-by: NickLucche --- tests/entrypoints/openai/test_audio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 4cf864bdb2de9..af5f2fec402ed 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -313,7 +313,7 @@ async def test_chat_streaming_input_audio( "format": "wav", }, }, - {"type": "text", "text": "What's happening in this audio?"}, + {"type": "text", "text": "What's a short title for this audio?"}, ], } ] From a5bc77c253a6a3a7f2df6936ac7c9f233030df09 Mon Sep 17 00:00:00 2001 From: Micah Williamson Date: Mon, 22 Dec 2025 09:41:56 -0600 Subject: [PATCH 20/31] [AMD][CI] Add "V1 Test e2e + engine" to mi325_8 Agent Pool (#31040) Signed-off-by: Micah Williamson --- .buildkite/test-amd.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 0d7194f003840..a4d89a46b01ac 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -349,7 +349,9 @@ steps: - label: V1 Test e2e + engine # 65min timeout_in_minutes: 90 mirror_hardwares: [amdexperimental] - agent_pool: mi325_4 + # The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability. + # See discussion here: https://github.com/vllm-project/vllm/pull/31040 + agent_pool: mi325_8 # grade: Blocking source_file_dependencies: - vllm/ From 022f3cea5327cc720a325c50931e1edcfdf2d32b Mon Sep 17 00:00:00 2001 From: TJian Date: Tue, 23 Dec 2025 01:28:22 +0900 Subject: [PATCH 21/31] [ROCm] [Critical]: Remove unused variable (#31156) Signed-off-by: tjtanaa --- csrc/fused_qknorm_rope_kernel.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/csrc/fused_qknorm_rope_kernel.cu b/csrc/fused_qknorm_rope_kernel.cu index 5c23a90794594..a51e1a347e1d4 100644 --- a/csrc/fused_qknorm_rope_kernel.cu +++ b/csrc/fused_qknorm_rope_kernel.cu @@ -405,7 +405,6 @@ void fused_qk_norm_rope( qkv.scalar_type() == k_weight.scalar_type(), "qkv, q_weight and k_weight must have the same dtype"); - int64_t rotary_dim = cos_sin_cache.size(1); int64_t num_tokens = qkv.size(0); TORCH_CHECK(position_ids.size(0) == num_tokens, "Number of tokens in position_ids must match QKV"); From 8dd0db687b40a2fe7827975d93a296c9070aaba1 Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Mon, 22 Dec 2025 08:45:59 -0800 Subject: [PATCH 22/31] [UX] improve profiler error message (#31125) Signed-off-by: Boyuan Feng --- vllm/v1/worker/gpu_worker.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index bc71351d2cc55..4747388e22b3d 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -634,7 +634,12 @@ class Worker(WorkerBase): def profile(self, is_start: bool = True): if self.profiler is None: - raise RuntimeError("Profiling is not enabled.") + raise RuntimeError( + "Profiling is not enabled. Please set --profiler-config to enable " + "profiling. Example: " + "'--profiler-config.profiler=torch --profiler-config.torch_profiler_dir" + "=YOUR_DIR_PATH_TO_DUMP_TRACE'" + ) if is_start: self.profiler.start() else: From ab3a85fd688dc2544edab3098fab8e2c8893a707 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Mon, 22 Dec 2025 11:19:27 -0600 Subject: [PATCH 23/31] [ROCm][CI/Build] Fix triton version to one that has triton_kernels required for gpt-oss to run (#31159) Signed-off-by: Gregory Shtrasberg --- docker/Dockerfile.rocm_base | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index ac63231094462..c5e94ee1f6928 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -1,5 +1,5 @@ ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete -ARG TRITON_BRANCH="a272dfa8" +ARG TRITON_BRANCH="57c693b6" ARG TRITON_REPO="https://github.com/ROCm/triton.git" ARG PYTORCH_BRANCH="89075173" ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" @@ -162,4 +162,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \ && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ - && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt \ No newline at end of file + && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt From 7b926e890195f026b60f36506503a80afc583b33 Mon Sep 17 00:00:00 2001 From: Yongye Zhu Date: Mon, 22 Dec 2025 09:34:19 -0800 Subject: [PATCH 24/31] [MoE Refactor][9/N] Use modular kernel for unquantized Triton MoE (#31052) Signed-off-by: Yongye Zhu --- tests/kernels/moe/test_moe.py | 7 ++++++ .../fused_moe/unquantized_fused_moe_method.py | 22 +++++++++++++------ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index ce99d9691fdc8..fd6ce6bfbd782 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -60,6 +60,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_w from vllm.model_executor.models.mixtral import MixtralMoE from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types +from vllm.v1.worker.workspace import init_workspace_manager NUM_EXPERTS = [8, 64, 192] EP_SIZE = [1, 4] @@ -487,6 +488,7 @@ def test_mixtral_moe( monkeypatch.setenv("MASTER_ADDR", "localhost") monkeypatch.setenv("MASTER_PORT", "12345") init_distributed_environment() + init_workspace_manager(torch.cuda.current_device()) # Instantiate our and huggingface's MoE blocks vllm_config.compilation_config.static_forward_context = dict() @@ -533,6 +535,11 @@ def test_mixtral_moe( torch.cuda.synchronize() torch.cuda.empty_cache() + # FIXME (zyongye) fix this after we move self.kernel + # assignment in FusedMoE.__init__ + + vllm_moe.experts.quant_method.process_weights_after_loading(vllm_moe.experts) + # Run forward passes for both MoE blocks hf_states, _ = hf_moe.forward(hf_inputs) vllm_states = vllm_moe.forward(vllm_inputs) diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index 1ee7b65b22e3f..82dbccf3fa9da 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -6,6 +6,7 @@ import torch import torch.nn.functional as F import vllm.envs as envs +import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm._aiter_ops import rocm_aiter_ops from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp @@ -23,6 +24,9 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize, ) +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP, +) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum @@ -30,9 +34,9 @@ from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe if current_platform.is_cuda_alike(): from .fused_batched_moe import BatchedTritonExperts - from .fused_moe import TritonExperts, fused_experts + from .fused_moe import TritonExperts else: - fused_experts = None # type: ignore + TritonExperts = None # type: ignore if current_platform.is_tpu(): from .moe_pallas import fused_moe as fused_moe_pallas @@ -265,6 +269,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer) else: layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer) + elif current_platform.is_cuda_alike(): + self.moe_quant_config = self.get_fused_moe_quant_config(layer) + self.kernel = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), + TritonExperts(self.moe_quant_config), + shared_experts=None, + ) def apply( self, @@ -278,9 +289,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): router_logits=router_logits, ) - def get_fused_moe_quant_config( - self, layer: torch.nn.Module - ) -> FusedMoEQuantConfig | None: + def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig: if self.moe.has_bias: return biased_moe_quant_config( layer.w13_bias, @@ -322,7 +331,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): apply_router_weight_on_input=layer.apply_router_weight_on_input, ) else: - result = fused_experts( + result = self.kernel( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, @@ -330,7 +339,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): topk_ids=topk_ids, inplace=True, activation=layer.activation, - quant_config=self.moe_quant_config, apply_router_weight_on_input=layer.apply_router_weight_on_input, global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, From b10f41c894bc55c8dd8234a40edd1971be762805 Mon Sep 17 00:00:00 2001 From: Pavani Majety Date: Mon, 22 Dec 2025 11:15:57 -0800 Subject: [PATCH 25/31] [SM100] Enable fp8 compute for prefill MLA (#30746) Signed-off-by: Pavani Majety --- tests/v1/attention/test_mla_backends.py | 7 +- .../quantization/utils/flashinfer_fp4_moe.py | 1 - vllm/v1/attention/backends/mla/common.py | 127 ++++++++++++++++-- 3 files changed, 117 insertions(+), 18 deletions(-) diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py index 783e02ce89bdb..6a62440d95417 100644 --- a/tests/v1/attention/test_mla_backends.py +++ b/tests/v1/attention/test_mla_backends.py @@ -27,7 +27,7 @@ from vllm.utils.math_utils import cdiv from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.v1.attention.backends.mla.common import QueryLenSupport from vllm.v1.attention.backends.utils import CommonAttentionMetadata -from vllm.v1.kv_cache_interface import FullAttentionSpec +from vllm.v1.kv_cache_interface import MLAAttentionSpec BACKENDS_TO_TEST = [ AttentionBackendEnum.CUTLASS_MLA, @@ -289,7 +289,7 @@ class MockMLAAttentionLayer(AttentionLayerBase): def run_attention_backend( backend: AttentionBackendEnum, - kv_cache_spec: FullAttentionSpec, + kv_cache_spec: MLAAttentionSpec, layer_names: list[str], vllm_config, device: torch.device, @@ -740,7 +740,7 @@ def test_backend_correctness( kv_cache = kv_cache_per_block_size[block_size] # Create kv_cache_spec with the correct block_size for this backend - backend_kv_cache_spec = FullAttentionSpec( + backend_kv_cache_spec = MLAAttentionSpec( block_size=block_size, num_kv_heads=vllm_config.model_config.get_num_kv_heads( vllm_config.parallel_config @@ -748,6 +748,7 @@ def test_backend_correctness( head_size=vllm_config.model_config.get_head_size(), dtype=vllm_config.model_config.dtype, sliding_window=vllm_config.model_config.get_sliding_window(), + cache_dtype_str=vllm_config.cache_config.cache_dtype, ) backend_output = run_attention_backend( diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index 1d410316d6299..16598d567848f 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -325,7 +325,6 @@ def flashinfer_trtllm_fp4_moe( local_expert_offset=layer.ep_rank * layer.local_num_experts, local_num_experts=layer.local_num_experts, routed_scaling_factor=None, - tile_tokens_dim=None, routing_method_type=routing_method_type, do_finalize=True, )[0] diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index fea482493635f..0ff951213040a 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -355,6 +355,7 @@ class MLACommonPrefillMetadata: max_query_len: int chunked_context: ChunkedContextMetadata | None = None query_seq_lens: torch.Tensor | None = None + q_data_type: torch.dtype | None = None @dataclass @@ -539,6 +540,11 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): metadata_cls if metadata_cls is not None else MLACommonMetadata ) self.kv_cache_spec = kv_cache_spec + self.q_data_type = ( + current_platform.fp8_dtype() + if (kv_cache_spec is not None and "fp8" in kv_cache_spec.cache_dtype_str) + else vllm_config.model_config.dtype + ) scheduler_config = vllm_config.scheduler_config self.model_config = vllm_config.model_config parallel_config = vllm_config.parallel_config @@ -681,7 +687,6 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): # For main run, qo_indptr == kv_indptr kv_indptr = qo_indptr.clone() - # Prepare main prefill self._fi_prefill_main.plan( qo_indptr=qo_indptr, @@ -694,7 +699,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): sm_scale=self._global_hyperparameters.sm_scale, window_left=self._global_hyperparameters.window_left, logits_soft_cap=self._global_hyperparameters.logits_soft_cap, - q_data_type=self.model_config.dtype, + q_data_type=self.q_data_type, ) # Prepare context prefills @@ -713,7 +718,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): sm_scale=self._global_hyperparameters.sm_scale, window_left=self._global_hyperparameters.window_left, logits_soft_cap=self._global_hyperparameters.logits_soft_cap, - q_data_type=self.model_config.dtype, + q_data_type=self.q_data_type, ) prefill.prefill_main = self._fi_prefill_main @@ -970,6 +975,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): query_start_loc=prefill_query_start_loc, max_query_len=max_query_len, chunked_context=chunked_context_metadata, + q_data_type=self.q_data_type, ) if self._use_cudnn_prefill: @@ -1370,8 +1376,15 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): return attn_out def _run_prefill_new_tokens_fa( - self, prefill: MLACommonPrefillMetadata, q, k, v, return_softmax_lse + self, + prefill: MLACommonPrefillMetadata, + q, + k, + v, + return_softmax_lse, + fp8_attention: bool, ): + logger.debug_once("Running FlashAttention prefill new tokens", scope="local") return self._flash_attn_varlen_diff_headdims( q=q, k=k, @@ -1386,11 +1399,23 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): ) def _run_prefill_new_tokens_fi( - self, prefill: MLACommonPrefillMetadata, q, k, v, return_softmax_lse + self, + prefill: MLACommonPrefillMetadata, + q, + k, + v, + return_softmax_lse, + fp8_attention: bool, ): + logger.debug_once("Running FlashInfer prefill new tokens", scope="local") assert isinstance(prefill, FlashInferPrefillMetadata) assert prefill.prefill_main is not None - + if fp8_attention: + logger.debug_once("Running Flashinfer prefill in FP8") + fp8_dtype = current_platform.fp8_dtype() + q = q.to(fp8_dtype) + k = k.to(fp8_dtype) + v = v.to(fp8_dtype) ret = prefill.prefill_main.run( q=q, k=k, @@ -1403,10 +1428,18 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): return ret def _run_prefill_new_tokens_cudnn( - self, prefill: MLACommonPrefillMetadata, q, k, v, return_softmax_lse + self, + prefill: MLACommonPrefillMetadata, + q, + k, + v, + return_softmax_lse, + fp8_attention: bool, ): + logger.debug_once("Running Cudnn prefill new tokens", scope="local") assert isinstance(prefill, CudnnPrefillMetadata) assert prefill.query_seq_lens is not None + assert fp8_attention is False, "Cudnn prefill does not support fp8 attention" output, lse = cudnn_batch_prefill_with_kv_cache( q=q, k_cache=k, @@ -1428,9 +1461,19 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): return output def _run_prefill_context_chunk_fa( - self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v + self, + prefill: MLACommonPrefillMetadata, + chunk_idx: int, + q, + k, + v, + fp8_attention: bool, ): + logger.debug_once("Running FlashAttention prefill context chunk", scope="local") assert prefill.chunked_context is not None + assert fp8_attention is False, ( + "FlashAttention prefill does not support fp8 attention" + ) return self._flash_attn_varlen_diff_headdims( q=q, k=k, @@ -1445,10 +1488,22 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): ) def _run_prefill_context_chunk_fi( - self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v + self, + prefill: MLACommonPrefillMetadata, + chunk_idx: int, + q, + k, + v, + fp8_attention: bool, ): + logger.debug_once("Running FlashInfer prefill context chunk", scope="local") assert isinstance(prefill, FlashInferPrefillMetadata) - + if fp8_attention: + logger.debug_once("Running FlashInfer prefill in FP8") + fp8_dtype = current_platform.fp8_dtype() + q = q.to(fp8_dtype) + k = k.to(fp8_dtype) + v = v.to(fp8_dtype) attn_out, lse = prefill.prefill_chunks[chunk_idx].run( q=q, k=k, @@ -1460,12 +1515,20 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): return attn_out, lse.transpose(0, 1).contiguous() def _run_prefill_context_chunk_cudnn( - self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v + self, + prefill: MLACommonPrefillMetadata, + chunk_idx: int, + q, + k, + v, + fp8_attention: bool, ): + logger.debug_once("Running Cudnn prefill context chunk", scope="local") assert isinstance(prefill, CudnnPrefillMetadata) assert prefill.chunked_context is not None assert prefill.chunked_context.seq_lens[chunk_idx] is not None assert prefill.query_seq_lens is not None + assert fp8_attention is False, "Cudnn prefill does not support fp8 attention" return cudnn_batch_prefill_with_kv_cache( q=q, k_cache=k, @@ -1485,13 +1548,27 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): ) def _run_prefill_new_tokens_trtllm_ragged( - self, prefill: MLACommonPrefillMetadata, q, k, v, return_softmax_lse + self, + prefill: MLACommonPrefillMetadata, + q, + k, + v, + return_softmax_lse, + fp8_attention: bool, ): + logger.debug_once("Running TRT-LLM ragged prefill new tokens", scope="local") """TRT-LLM ragged attention for new tokens (causal).""" from flashinfer.prefill import trtllm_ragged_attention_deepseek assert prefill.query_seq_lens is not None + if fp8_attention: + logger.debug_once("Running TRT-LLM ragged prefill in FP8") + fp8_dtype = current_platform.fp8_dtype() + q = q.to(fp8_dtype) + k = k.to(fp8_dtype) + v = v.to(fp8_dtype) + ret = trtllm_ragged_attention_deepseek( query=q, key=k, @@ -1518,8 +1595,15 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): return ret def _run_prefill_context_chunk_trtllm_ragged( - self, prefill: MLACommonPrefillMetadata, chunk_idx: int, q, k, v + self, + prefill: MLACommonPrefillMetadata, + chunk_idx: int, + q, + k, + v, + fp8_attention: bool, ): + logger.debug_once("Running TRT-LLM ragged prefill context chunk", scope="local") """TRT-LLM ragged attention for context chunks (non-causal).""" from flashinfer.prefill import trtllm_ragged_attention_deepseek @@ -1535,6 +1619,13 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): ) self._workspace_buffer.fill_(0) + if fp8_attention: + logger.debug_once("Running TRT-LLM ragged prefill context chunk in FP8") + fp8_dtype = current_platform.fp8_dtype() + q = q.to(fp8_dtype) + k = k.to(fp8_dtype) + v = v.to(fp8_dtype) + attn_out, lse = trtllm_ragged_attention_deepseek( query=q, key=k, @@ -1687,6 +1778,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): kv_c_and_k_pe_cache: torch.Tensor, attn_metadata: MLACommonMetadata, k_scale: torch.Tensor, + fp8_attention: bool, ): assert attn_metadata.prefill is not None prefill_metadata = attn_metadata.prefill @@ -1725,6 +1817,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): q=q, k=k, v=v, + fp8_attention=fp8_attention, ) if output is None: @@ -1753,6 +1846,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): attn_metadata: MLACommonMetadata, k_scale: torch.Tensor, dcp_world_size: int, + fp8_attention: bool, ): assert k_scale is None, "DCP not support scaled kvcache now." assert attn_metadata.prefill is not None @@ -1829,6 +1923,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): q=q, k=k, v=v, + fp8_attention=fp8_attention, ) if output is None: @@ -1859,6 +1954,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): attn_metadata: MLACommonMetadata, k_scale: torch.Tensor, output: torch.Tensor, + fp8_attention: bool = False, ) -> None: # TODO (zyongye): Prefill function here assert attn_metadata.prefill is not None @@ -1878,6 +1974,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): k=k, v=v, return_softmax_lse=has_context, + fp8_attention=fp8_attention, ) if has_context: @@ -1890,11 +1987,12 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): attn_metadata, k_scale=None, dcp_world_size=self.dcp_world_size, + fp8_attention=fp8_attention, ) ) else: context_output, context_lse = self._compute_prefill_context( - q, kv_c_and_k_pe_cache, attn_metadata, k_scale + q, kv_c_and_k_pe_cache, attn_metadata, k_scale, fp8_attention ) # unpad if necessary @@ -2015,6 +2113,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): attn_metadata, layer._k_scale, output=output[num_decode_tokens:], + fp8_attention=fp8_attention, ) if has_decode: From 95863540538804a48e31ca5806cefbac9d4c5326 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 22 Dec 2025 15:06:29 -0500 Subject: [PATCH 26/31] [Doc] Add vllm-metal to hardware plugin documentation (#31174) Signed-off-by: mgoin --- docs/getting_started/installation/README.md | 1 + docs/getting_started/installation/cpu.apple.inc.md | 3 +++ 2 files changed, 4 insertions(+) diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md index 9b93a6b9ac12c..cdbe601ca801a 100644 --- a/docs/getting_started/installation/README.md +++ b/docs/getting_started/installation/README.md @@ -28,3 +28,4 @@ The backends below live **outside** the main `vllm` repository and follow the | Cambricon MLU | `vllm-mlu` | | | Baidu Kunlun XPU | N/A, install from source | | | Sophgo TPU | N/A, install from source | | +| Apple Silicon (Metal) | N/A, install from source | | diff --git a/docs/getting_started/installation/cpu.apple.inc.md b/docs/getting_started/installation/cpu.apple.inc.md index 9f1f6e3821397..c5a4d00ddcf4c 100644 --- a/docs/getting_started/installation/cpu.apple.inc.md +++ b/docs/getting_started/installation/cpu.apple.inc.md @@ -4,6 +4,9 @@ vLLM has experimental support for macOS with Apple Silicon. For now, users must Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. +!!! tip "GPU-Accelerated Inference with vLLM-Metal" + For GPU-accelerated inference on Apple Silicon using Metal, check out [vllm-metal](https://github.com/vllm-project/vllm-metal), a community-maintained hardware plugin that uses MLX as the compute backend. + # --8<-- [end:installation] # --8<-- [start:requirements] From de71747655668d268998264365dc9bbc8e58d410 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Mon, 22 Dec 2025 16:06:10 -0500 Subject: [PATCH 27/31] [SpecDecode] Simplified alternative padded-speculation acceptance rate fix (#29845) Signed-off-by: Lucas Wilkinson --- tests/v1/spec_decode/test_eagle.py | 10 +++++-- vllm/v1/attention/backends/mla/common.py | 24 ++++++++++------- .../attention/backends/mla/flashattn_mla.py | 5 ++-- vllm/v1/attention/backends/mla/flashmla.py | 2 +- .../attention/backends/mla/rocm_aiter_mla.py | 2 +- vllm/v1/spec_decode/eagle.py | 26 ++++++++++++++++--- vllm/v1/spec_decode/utils.py | 2 ++ vllm/v1/worker/gpu_model_runner.py | 16 +++++++----- 8 files changed, 62 insertions(+), 25 deletions(-) diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index f63cd3a6e42aa..a5e326e82c592 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -306,10 +306,16 @@ def test_prepare_inputs_padded(): proposer = _create_proposer("eagle", num_speculative_tokens) - output_metadata, token_indices_to_sample = proposer.prepare_inputs_padded( - common_attn_metadata, spec_decode_metadata, valid_sampled_tokens_count + output_metadata, token_indices_to_sample, num_rejected_tokens_gpu = ( + proposer.prepare_inputs_padded( + common_attn_metadata, spec_decode_metadata, valid_sampled_tokens_count + ) ) + # Verify num_rejected_tokens_gpu is calculated correctly + expected_num_rejected = torch.tensor([1, 0, 2], dtype=torch.int32, device=device) + assert torch.equal(num_rejected_tokens_gpu, expected_num_rejected) + assert output_metadata.max_query_len == 3 assert torch.equal(output_metadata.query_start_loc, expected_query_start_loc) assert torch.equal(token_indices_to_sample, expected_token_indices_to_sample) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 0ff951213040a..7a6edc0add8bf 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -564,6 +564,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): self.dcp_rank = 0 self.dcp_local_block_size = parallel_config.cp_kv_cache_interleave_size self.dcp_virtual_block_size = self.dcp_local_block_size * self.dcp_world_size + self.cp_kv_cache_interleave_size = parallel_config.cp_kv_cache_interleave_size # Don't try to access the runner on AMD if self.aot_schedule: @@ -727,8 +728,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): def _build_decode( self, block_table_tensor: torch.Tensor, - seq_lens_cpu: torch.Tensor, seq_lens_device: torch.Tensor, + max_seq_len: int, query_start_loc_cpu: torch.Tensor, query_start_loc_device: torch.Tensor, num_decode_tokens: int, @@ -778,13 +779,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): query_start_loc = common_attn_metadata.query_start_loc query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu seq_lens = common_attn_metadata.seq_lens - seq_lens_cpu = common_attn_metadata.seq_lens_cpu dcp_local_seq_lens = common_attn_metadata.dcp_local_seq_lens - dcp_local_seq_lens_cpu = common_attn_metadata.dcp_local_seq_lens_cpu - - query_seq_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] - - num_computed_tokens_cpu = common_attn_metadata.seq_lens_cpu - query_seq_lens_cpu num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( split_decodes_and_prefills( @@ -799,6 +794,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): prefill_metadata = None if num_prefills > 0: + num_computed_tokens_cpu = common_attn_metadata.num_computed_tokens_cpu + reqs_start = num_decodes # prefill_start context_lens_cpu = num_computed_tokens_cpu[reqs_start:num_reqs] @@ -995,13 +992,22 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): dcp_tot_seq_lens_device = None if self.dcp_world_size > 1: dcp_tot_seq_lens_device = seq_lens[:num_decodes] - seq_lens_cpu = dcp_local_seq_lens_cpu seq_lens = dcp_local_seq_lens + # After DCP distribution, the maximum number of tokens for any rank is + # ceil(L / (N * I)) * I, where L is max_seq_len, N is dcp_world_size, + # and I is cp_kv_cache_interleave_size. + # This eliminates GPU->CPU sync while minimizing workspace + # over-allocation. + num_partitions = self.dcp_world_size * self.cp_kv_cache_interleave_size + max_seq_len = ( + (max_seq_len + num_partitions - 1) // num_partitions + ) * self.cp_kv_cache_interleave_size + decode_metadata = self._build_decode( block_table_tensor=block_table_tensor[:num_decodes, ...], - seq_lens_cpu=seq_lens_cpu[:num_decodes], seq_lens_device=seq_lens[:num_decodes], + max_seq_len=max_seq_len, query_start_loc_cpu=query_start_loc_cpu[: num_decodes + 1], query_start_loc_device=query_start_loc[: num_decodes + 1], num_decode_tokens=num_decode_tokens, diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py index b28814aceada9..b4a68f472e9c1 100644 --- a/vllm/v1/attention/backends/mla/flashattn_mla.py +++ b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -169,8 +169,8 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata] def _build_decode( self, block_table_tensor: torch.Tensor, - seq_lens_cpu: torch.Tensor, seq_lens_device: torch.Tensor, + max_seq_len: int, query_start_loc_cpu: torch.Tensor, query_start_loc_device: torch.Tensor, num_decode_tokens: int, @@ -178,7 +178,6 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata] ) -> FlashAttnMLADecodeMetadata: query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] max_query_len = query_lens_cpu.max().item() - max_seq_len = seq_lens_cpu.max().item() # For Flash Attention MLA + full cudagraph max_num_splits = 0 @@ -193,7 +192,7 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata] max_num_splits = 1 scheduler_metadata = self._schedule_decode( - num_reqs=seq_lens_cpu.numel(), + num_reqs=seq_lens_device.shape[0], cu_query_lens=query_start_loc_device, max_query_len=max_query_len, seqlens=seq_lens_device, diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 74a4cd8430250..913503ce44944 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -143,8 +143,8 @@ class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]): def _build_decode( self, block_table_tensor: torch.Tensor, - seq_lens_cpu: torch.Tensor, seq_lens_device: torch.Tensor, + max_seq_len: int, query_start_loc_cpu: torch.Tensor, query_start_loc_device: torch.Tensor, num_decode_tokens: int, diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 589d6ef2f6348..e8921f8a1c403 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -106,8 +106,8 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): def _build_decode( self, block_table_tensor: torch.Tensor, - seq_lens_cpu: torch.Tensor, seq_lens_device: torch.Tensor, + max_seq_len: int, query_start_loc_cpu: torch.Tensor, query_start_loc_device: torch.Tensor, num_decode_tokens: int, diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 65a0a88ec0f5d..66697132b365c 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -236,6 +236,7 @@ class EagleProposer: common_attn_metadata: CommonAttentionMetadata, sampling_metadata: SamplingMetadata, mm_embed_inputs: tuple[list[torch.Tensor], torch.Tensor] | None = None, + num_rejected_tokens_gpu: torch.Tensor | None = None, ) -> torch.Tensor: num_tokens = target_token_ids.shape[0] batch_size = next_token_ids.shape[0] @@ -414,6 +415,17 @@ class EagleProposer: common_attn_metadata.query_start_loc_cpu = torch.from_numpy( self.token_arange_np[: batch_size + 1] ).clone() + + # In padded drafter batch, we need to adjust the sequence lengths + # to remove the "padding" (i.e. rejected tokens). + # Only apply this adjustment when we have rejected tokens + # (i.e., not the first proposal). + if self.num_speculative_tokens > 1 and num_rejected_tokens_gpu is not None: + common_attn_metadata.seq_lens -= num_rejected_tokens_gpu + # Invalidate the CPU-side shadows to avoid H<>D sync. + common_attn_metadata._seq_lens_cpu = None + common_attn_metadata._num_computed_tokens_cpu = None + for token_index in range(self.num_speculative_tokens - 1): # Update the inputs. # cast to int32 is crucial when eagle model is compiled. @@ -628,13 +640,14 @@ class EagleProposer: common_attn_metadata: CommonAttentionMetadata, spec_decode_metadata: SpecDecodeMetadata, valid_sampled_tokens_count: torch.Tensor, - ) -> tuple[CommonAttentionMetadata, torch.Tensor]: + ) -> tuple[CommonAttentionMetadata, torch.Tensor, torch.Tensor]: """ This function is used to prepare the inputs for speculative decoding It updates the common_attn_metadata for speculative decoding, but does not consider the rejected tokens. Instead, all tokens are included as inputs to the speculator, with the rejected tokens used as padding and filtered out later by `token_indices_to_sample`. + No blocking CPU operations should be introduced in this function. """ num_reqs = common_attn_metadata.num_reqs device = valid_sampled_tokens_count.device @@ -642,14 +655,17 @@ class EagleProposer: token_indices_to_sample = torch.empty( (num_reqs,), dtype=torch.int32, device=device ) + num_rejected_tokens_gpu = torch.empty( + (num_reqs,), dtype=torch.int32, device=device + ) - # Kernel grid: one program per request (row) grid = (num_reqs,) eagle_prepare_inputs_padded_kernel[grid]( spec_decode_metadata.cu_num_draft_tokens, valid_sampled_tokens_count, common_attn_metadata.query_start_loc, token_indices_to_sample, + num_rejected_tokens_gpu, num_reqs, ) @@ -674,7 +690,11 @@ class EagleProposer: dcp_local_seq_lens=common_attn_metadata.dcp_local_seq_lens, ) - return spec_common_attn_metadata, token_indices_to_sample + return ( + spec_common_attn_metadata, + token_indices_to_sample, + num_rejected_tokens_gpu, + ) def propose_tree( self, diff --git a/vllm/v1/spec_decode/utils.py b/vllm/v1/spec_decode/utils.py index 9d4399d00487a..783b6ed5961bc 100644 --- a/vllm/v1/spec_decode/utils.py +++ b/vllm/v1/spec_decode/utils.py @@ -23,6 +23,7 @@ def eagle_prepare_inputs_padded_kernel( valid_sampled_tokens_count_ptr, # [num_reqs] query_start_loc_gpu_ptr, # [num_reqs + 1] token_indices_to_sample_ptr, # [num_reqs] (output) + num_rejected_tokens_gpu_ptr, # [num_reqs] (output) num_reqs, # tl.int32 ): """ @@ -56,6 +57,7 @@ def eagle_prepare_inputs_padded_kernel( index_to_sample = q_last_tok_idx - num_rejected_tokens tl.store(token_indices_to_sample_ptr + req_idx, index_to_sample) + tl.store(num_rejected_tokens_gpu_ptr + req_idx, num_rejected_tokens) @triton.jit diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0a17923e89989..455406394d3ec 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3534,6 +3534,7 @@ class GPUModelRunner( next_token_ids, valid_sampled_tokens_count ) + num_rejected_tokens_gpu = None if spec_decode_metadata is None: token_indices_to_sample = None # input_ids can be None for multimodal models. @@ -3564,12 +3565,14 @@ class GPUModelRunner( else: target_hidden_states = hidden_states[token_indices] else: - common_attn_metadata, token_indices_to_sample = ( - self.drafter.prepare_inputs_padded( - common_attn_metadata, - spec_decode_metadata, - valid_sampled_tokens_count, - ) + ( + common_attn_metadata, + token_indices_to_sample, + num_rejected_tokens_gpu, + ) = self.drafter.prepare_inputs_padded( + common_attn_metadata, + spec_decode_metadata, + valid_sampled_tokens_count, ) total_num_tokens = common_attn_metadata.num_actual_tokens # When padding the batch, token_indices is just a range @@ -3600,6 +3603,7 @@ class GPUModelRunner( sampling_metadata=sampling_metadata, common_attn_metadata=common_attn_metadata, mm_embed_inputs=mm_embed_inputs, + num_rejected_tokens_gpu=num_rejected_tokens_gpu, ) return draft_token_ids From 5312a7284e58e47bd10f7acbab52f703714027c2 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Mon, 22 Dec 2025 17:24:27 -0500 Subject: [PATCH 28/31] [Bug] Fix `'CutlassMLAImpl' object has no attribute '_workspace_buffer'` (#31173) Signed-off-by: yewentao256 --- vllm/v1/attention/backends/mla/common.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 7a6edc0add8bf..619c1c2794daa 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -355,6 +355,7 @@ class MLACommonPrefillMetadata: max_query_len: int chunked_context: ChunkedContextMetadata | None = None query_seq_lens: torch.Tensor | None = None + workspace_buffer: torch.Tensor | None = None q_data_type: torch.dtype | None = None @@ -986,6 +987,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): prefill_metadata.query_seq_lens = ( prefill_query_start_loc[1:] - prefill_query_start_loc[:-1] ) + prefill_metadata.workspace_buffer = self._workspace_buffer decode_metadata = None if num_decodes > 0: @@ -1567,6 +1569,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): from flashinfer.prefill import trtllm_ragged_attention_deepseek assert prefill.query_seq_lens is not None + assert prefill.workspace_buffer is not None if fp8_attention: logger.debug_once("Running TRT-LLM ragged prefill in FP8") @@ -1579,7 +1582,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): query=q, key=k, value=v, - workspace_buffer=self._workspace_buffer, + workspace_buffer=prefill.workspace_buffer, seq_lens=prefill.query_seq_lens, max_q_len=prefill.max_query_len, max_kv_len=prefill.max_query_len, @@ -1615,6 +1618,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): assert prefill.chunked_context is not None assert prefill.chunked_context.seq_lens[chunk_idx] is not None + assert prefill.workspace_buffer is not None out = torch.zeros( q.shape[0], @@ -1623,7 +1627,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): device=q.device, dtype=q.dtype, ) - self._workspace_buffer.fill_(0) + prefill.workspace_buffer.fill_(0) if fp8_attention: logger.debug_once("Running TRT-LLM ragged prefill context chunk in FP8") @@ -1636,7 +1640,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): query=q, key=k, value=v, - workspace_buffer=self._workspace_buffer, + workspace_buffer=prefill.workspace_buffer, seq_lens=prefill.chunked_context.seq_lens[chunk_idx], max_q_len=prefill.max_query_len, max_kv_len=prefill.chunked_context.max_seq_lens[chunk_idx], From 85aff45e24de7af96d30baa1d7d0fc7aec43c28a Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Mon, 22 Dec 2025 17:25:22 -0500 Subject: [PATCH 29/31] [Perf] Remove blocking copy in GDN Attention (#31167) Signed-off-by: Benjamin Chislett --- vllm/v1/attention/backends/gdn_attn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py index ace2cbb0564c8..fcde986f48d46 100644 --- a/vllm/v1/attention/backends/gdn_attn.py +++ b/vllm/v1/attention/backends/gdn_attn.py @@ -143,7 +143,7 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata] query_start_loc = m.query_start_loc context_lens = m.num_computed_tokens_cpu - context_lens_tensor = context_lens.to(query_start_loc.device) + context_lens_tensor = context_lens.to(query_start_loc.device, non_blocking=True) nums_dict, batch_ptr, token_chunk_offset_ptr = None, None, None if ( From 6d518ffbaa04d3e930b320830542b6baf49aa61c Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 22 Dec 2025 18:40:35 -0500 Subject: [PATCH 30/31] [CI Failure] Disable mosaicml/mpt-7b and databricks/dbrx-instruct tests (#31182) Signed-off-by: mgoin --- docs/serving/integrations/langchain.md | 2 +- tests/models/registry.py | 8 ++++++-- tests/tokenizers_/test_detokenize.py | 3 ++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/docs/serving/integrations/langchain.md b/docs/serving/integrations/langchain.md index 192a61ea5b903..14b336dffa78f 100644 --- a/docs/serving/integrations/langchain.md +++ b/docs/serving/integrations/langchain.md @@ -16,7 +16,7 @@ To run inference on a single or multiple GPUs, use `VLLM` class from `langchain` from langchain_community.llms import VLLM llm = VLLM( - model="mosaicml/mpt-7b", + model="Qwen/Qwen3-4B", trust_remote_code=True, # mandatory for hf models max_new_tokens=128, top_k=10, diff --git a/tests/models/registry.py b/tests/models/registry.py index 82b9303b2a21b..081167c6aedf7 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -215,7 +215,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { trust_remote_code=True, ), "CwmForCausalLM": _HfExamplesInfo("facebook/cwm", min_transformers_version="4.58"), - "DbrxForCausalLM": _HfExamplesInfo("databricks/dbrx-instruct"), + # FIXME: databricks/dbrx-instruct has been deleted + "DbrxForCausalLM": _HfExamplesInfo( + "databricks/dbrx-instruct", is_available_online=False + ), "DeciLMForCausalLM": _HfExamplesInfo( "nvidia/Llama-3_3-Nemotron-Super-49B-v1", trust_remote_code=True, @@ -366,7 +369,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { {"tiny": "TitanML/tiny-mixtral"}, ), "MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False), - "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"), + # FIXME: mosaicml/mpt-7b has been deleted + "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b", is_available_online=False), "NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"), "NemotronHForCausalLM": _HfExamplesInfo( "nvidia/Nemotron-H-8B-Base-8K", trust_remote_code=True diff --git a/tests/tokenizers_/test_detokenize.py b/tests/tokenizers_/test_detokenize.py index d307993d04df9..ad6c5fb415aad 100644 --- a/tests/tokenizers_/test_detokenize.py +++ b/tests/tokenizers_/test_detokenize.py @@ -38,7 +38,8 @@ TOKENIZERS = [ "EleutherAI/gpt-j-6b", "EleutherAI/pythia-70m", "bigscience/bloom-560m", - "mosaicml/mpt-7b", + # FIXME: mosaicml/mpt-7b has been deleted + # "mosaicml/mpt-7b", "tiiuae/falcon-7b", "meta-llama/Llama-3.2-1B-Instruct", "codellama/CodeLlama-7b-hf", From b57b967386e1962955c93b7cb39828448b68789b Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Mon, 22 Dec 2025 18:42:58 -0500 Subject: [PATCH 31/31] [MoE Refactor][7/N] AITER MK (#31102) Signed-off-by: Robert Shaw Co-authored-by: Robert Shaw --- .../layers/fused_moe/fused_moe.py | 6 +- .../layers/fused_moe/prepare_finalize.py | 9 ++ .../layers/fused_moe/rocm_aiter_fused_moe.py | 79 ++++++++++++ .../model_executor/layers/quantization/fp8.py | 116 ++++++++---------- 4 files changed, 144 insertions(+), 66 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index c8d80ae023d43..bf51554341607 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -2132,6 +2132,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): torch.float16, torch.bfloat16, torch.float8_e4m3fn, + torch.float8_e4m3fnuz, ] E, num_tokens, N, K, top_k_num = self.moe_problem_size( @@ -2156,7 +2157,10 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): compute_type = tl.float16 elif hidden_states.dtype == torch.float32: compute_type = tl.float32 - elif hidden_states.dtype == torch.float8_e4m3fn: + elif ( + hidden_states.dtype == torch.float8_e4m3fn + or hidden_states.dtype == torch.float8_e4m3fnuz + ): compute_type = tl.bfloat16 else: raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}") diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py index e27e2eb32da0f..5d806fa843a3c 100644 --- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py @@ -13,6 +13,10 @@ from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): + def __init__(self, defer_input_quant: bool = False) -> None: + super().__init__() + self.defer_input_quant = defer_input_quant + @property def activation_format(self) -> mk.FusedMoEActivationFormat: return mk.FusedMoEActivationFormat.Standard @@ -48,6 +52,11 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): # Note: do not use inplace for shared experts overlap a1 = a1 * topk_weights.to(a1.dtype) + # Defer input quant to moe kernel for backends (e.g. AITER, FI) + # which use a single kernel call for quant + experts. + if self.defer_input_quant: + return a1, None, None, None, None + a1q, a1q_scale = moe_kernel_quantize_input( a1, quant_config.a1_scale, diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 882ad0a537cd5..ebd9e3a4a8f2a 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -5,11 +5,15 @@ from functools import lru_cache import torch +import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm._aiter_ops import rocm_aiter_ops from vllm.model_executor.layers.fused_moe.config import ( FUSED_MOE_UNQUANTIZED_CONFIG, FusedMoEQuantConfig, ) +from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( + TopKWeightAndReduceNoOP, +) class QuantMethod(IntEnum): @@ -263,3 +267,78 @@ def rocm_aiter_fused_experts( a2_scale=quant_config.a2_scale, doweight_stage1=apply_router_weight_on_input, ) + + +class AiterExperts(mk.FusedMoEPermuteExpertsUnpermute): + def __init__(self, quant_config): + super().__init__(quant_config) + + @property + def activation_formats( + self, + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return ( + mk.FusedMoEActivationFormat.Standard, + mk.FusedMoEActivationFormat.Standard, + ) + + def supports_expert_map(self): + return True + + def supports_chunking(self): + return False + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + return TopKWeightAndReduceNoOP() + + def workspace_shapes( + self, + M: int, + N: int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + # Workspaces are managed internally by AITER. + workspace1 = (0,) + workspace2 = (0,) + output = (M, K) + return (workspace1, workspace2, output) + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: torch.Tensor | None, + a1q_scale: torch.Tensor | None, + a2_scale: torch.Tensor | None, + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + apply_router_weight_on_input: bool, + ): + assert a1q_scale is None + assert a2_scale is None + assert expert_tokens_meta is None + + result = rocm_aiter_fused_experts( + hidden_states=hidden_states, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + expert_map=expert_map, + quant_config=self.quant_config, + ) + assert result.shape == output.shape + output.copy_(result) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index d19b20798ed06..9da19c082dc27 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -117,6 +117,7 @@ class Fp8MoeBackend(Enum): DEEPGEMM = 3 MARLIN = 4 TRITON = 5 + AITER = 6 def get_fp8_moe_backend( @@ -189,6 +190,10 @@ def get_fp8_moe_backend( logger.info_once("Using DeepGEMM backend for FP8 MoE", scope="local") return Fp8MoeBackend.DEEPGEMM + if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MOE: + logger.info_once("Using ROCm AITER backend for FP8 MoE", scope="local") + return Fp8MoeBackend.AITER + # default to Triton logger.info_once("Using Triton backend for FP8 MoE") return Fp8MoeBackend.TRITON @@ -888,16 +893,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): layer.w13_input_scale = None layer.w2_input_scale = None - self.rocm_aiter_moe_enabled = False - def process_weights_after_loading(self, layer: Module) -> None: if getattr(layer, "_already_called_process_weights_after_loading", False): return - # Lazy import to avoid importing triton too early. - - self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() - # TODO (rob): refactor block quant into separate class. if self.block_quant: assert self.quant_config.activation_scheme == "dynamic" @@ -932,7 +931,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): replace_parameter(layer, "w13_weight_scale_inv", w13_weight_scale_inv) replace_parameter(layer, "w2_weight", w2_weight) replace_parameter(layer, "w2_weight_scale_inv", w2_weight_scale_inv) - if self.rocm_aiter_moe_enabled: + if self.fp8_backend == Fp8MoeBackend.AITER: # reshaping weights is required for aiter moe kernel. shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( layer.w13_weight.data, layer.w2_weight.data @@ -1026,7 +1025,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): ) start += shard_size - if self.rocm_aiter_moe_enabled: + if self.fp8_backend == Fp8MoeBackend.AITER: shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( layer.w13_weight, layer.w2_weight ) @@ -1072,6 +1071,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): self.moe_quant_config = config self.kernel = mk.FusedMoEModularKernel( + # TODO(rob): we can use the generic MoEPrepareAndFinalizeNoEP + # with the changes to defer input quantization FlashInferAllGatherMoEPrepareAndFinalize( use_dp=(self.moe.dp_size > 1), use_deepseek_fp8_block_scale=self.block_quant, @@ -1093,6 +1094,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): Fp8MoeBackend.DEEPGEMM, Fp8MoeBackend.TRITON, Fp8MoeBackend.MARLIN, + Fp8MoeBackend.AITER, ]: from vllm.model_executor.layers.fused_moe import ( TritonOrDeepGemmExperts, @@ -1103,24 +1105,33 @@ class Fp8MoEMethod(FusedMoEMethodBase): from vllm.model_executor.layers.fused_moe.prepare_finalize import ( MoEPrepareAndFinalizeNoEP, ) + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + AiterExperts, + ) config = self.get_fused_moe_quant_config(layer) assert config is not None self.moe_quant_config = config - use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN - allow_deep_gemm = self.fp8_backend == Fp8MoeBackend.DEEPGEMM - moe_kernel = ( - MarlinExperts(quant_config=self.moe_quant_config) - if use_marlin - else TritonOrDeepGemmExperts( - quant_config=self.moe_quant_config, - allow_deep_gemm=allow_deep_gemm, - ) - ) - self.kernel = mk.FusedMoEModularKernel( - MoEPrepareAndFinalizeNoEP(), moe_kernel - ) + if self.fp8_backend == Fp8MoeBackend.AITER: + self.kernel = mk.FusedMoEModularKernel( + # TODO: make defer_input_quant an attr of the AiterExperts + MoEPrepareAndFinalizeNoEP(defer_input_quant=True), + AiterExperts(quant_config=self.moe_quant_config), + ) + elif self.fp8_backend == Fp8MoeBackend.MARLIN: + self.kernel = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), + MarlinExperts(quant_config=self.moe_quant_config), + ) + else: + self.kernel = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), + TritonOrDeepGemmExperts( + quant_config=self.moe_quant_config, + allow_deep_gemm=(self.fp8_backend == Fp8MoeBackend.DEEPGEMM), + ), + ) self.use_inplace = True def maybe_make_prepare_finalize( @@ -1128,7 +1139,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, ) -> mk.FusedMoEPrepareAndFinalize | None: if ( - self.rocm_aiter_moe_enabled + self.fp8_backend == Fp8MoeBackend.AITER or self.fp8_backend == Fp8MoeBackend.MARLIN or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM ): @@ -1161,11 +1172,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): TritonOrDeepGemmExperts, ) - assert ( - self.fp8_backend != Fp8MoeBackend.MARLIN - ) and not self.rocm_aiter_moe_enabled, ( - "Marlin and ROCm AITER are not supported with all2all yet." - ) + if self.fp8_backend in [Fp8MoeBackend.MARLIN, Fp8MoeBackend.AITER]: + raise NotImplementedError( + "Marlin and ROCm AITER are not supported with all2all yet." + ) assert self.moe_quant_config is not None @@ -1313,37 +1323,18 @@ class Fp8MoEMethod(FusedMoEMethodBase): hidden_states=x, router_logits=router_logits, ) - - if self.rocm_aiter_moe_enabled: - from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 - rocm_aiter_fused_experts, - ) - - # TODO(rob): convert this to MK. - result = rocm_aiter_fused_experts( - x, - layer.w13_weight, - layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - activation=layer.activation, - apply_router_weight_on_input=layer.apply_router_weight_on_input, - expert_map=layer.expert_map, - quant_config=self.moe_quant_config, - ) - else: - result = self.kernel( - x, - layer.w13_weight, - layer.w2_weight, - topk_weights, - topk_ids, - inplace=self.use_inplace, - activation=layer.activation, - global_num_experts=layer.global_num_experts, - expert_map=layer.expert_map, - apply_router_weight_on_input=layer.apply_router_weight_on_input, - ) + result = self.kernel( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights, + topk_ids, + inplace=self.use_inplace, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + ) return result @@ -1456,15 +1447,10 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod): layer.w13_input_scale = None layer.w2_input_scale = None - self.rocm_aiter_moe_enabled = False - def process_weights_after_loading(self, layer: Module) -> None: if getattr(layer, "_already_called_process_weights_after_loading", False): return - # Lazy import to avoid importing triton too early. - self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() - # If checkpoint is fp16, quantize in place. fp8_dtype = current_platform.fp8_dtype() w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype) @@ -1481,7 +1467,7 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod): replace_parameter(layer, "w2_weight", w2_weight) # Reshuffle weights for AITER if needed. - if self.rocm_aiter_moe_enabled: + if self.fp8_backend == Fp8MoeBackend.AITER: shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( layer.w13_weight, layer.w2_weight ) @@ -1489,7 +1475,7 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod): replace_parameter(layer, "w2_weight", shuffled_w2) # Rushuffle weights for MARLIN if needed. - if self.fp8_backend == Fp8MoeBackend.MARLIN: + elif self.fp8_backend == Fp8MoeBackend.MARLIN: prepare_moe_fp8_layer_for_marlin( layer, False, input_dtype=self.marlin_input_dtype )