From e458270a9537c5abc1d848f53f2d56fce92a6122 Mon Sep 17 00:00:00 2001 From: "Ye (Charlotte) Qi" Date: Thu, 11 Dec 2025 12:06:09 -0800 Subject: [PATCH 01/57] [Misc] Add mcp to requirements (#30474) Signed-off-by: Ye (Charlotte) Qi --- requirements/common.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index f18560b98d16c..31c8fb404f63a 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -50,4 +50,5 @@ ijson # Required for mistral streaming tool parser setproctitle # Used to set process names for better debugging and monitoring openai-harmony >= 0.0.3 # Required for gpt-oss anthropic == 0.71.0 -model-hosting-container-standards >= 0.1.9, < 1.0.0 \ No newline at end of file +model-hosting-container-standards >= 0.1.9, < 1.0.0 +mcp \ No newline at end of file From 92fea56fd1e148a5650160427d6b5c733ff211b8 Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Thu, 11 Dec 2025 15:28:03 -0500 Subject: [PATCH 02/57] [compile] Stop one-off setting enable_aot_compile and use context manager instead. (#30503) Signed-off-by: zhxchen17 --- vllm/compilation/wrapper.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index b59a4a9dd1527..02e974b0f9e8c 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -171,22 +171,24 @@ class TorchCompileWithNoGuardsWrapper: compiled_ptr = self.check_invariants_and_forward + aot_context = nullcontext() if envs.VLLM_USE_AOT_COMPILE: if hasattr(torch._dynamo.config, "enable_aot_compile"): - torch._dynamo.config.enable_aot_compile = True + aot_context = torch._dynamo.config.patch(enable_aot_compile=True) else: msg = "torch._dynamo.config.enable_aot_compile is not " msg += "available. AOT compile is disabled and please " msg += "upgrade PyTorch version to use AOT compile." logger.warning(msg) - self._compiled_callable = torch.compile( - compiled_ptr, - fullgraph=True, - dynamic=False, - backend=backend, - options=options, - ) + with aot_context: + self._compiled_callable = torch.compile( + compiled_ptr, + fullgraph=True, + dynamic=False, + backend=backend, + options=options, + ) if envs.VLLM_USE_BYTECODE_HOOK and mode != CompilationMode.STOCK_TORCH_COMPILE: torch._dynamo.convert_frame.register_bytecode_hook(self.bytecode_hook) From cf3eacfe58fa9e745c2854782ada884a9f992cf7 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 11 Dec 2025 20:45:23 +0000 Subject: [PATCH 03/57] Standardise `get_rope` to use `rope_parameters["partial_rotary_factor"]`, not `rotary_dim` (#30389) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- benchmarks/kernels/benchmark_mrope.py | 1 - benchmarks/kernels/benchmark_rope.py | 4 +- tests/compile/test_functionalization.py | 5 +- tests/kernels/core/test_mrope.py | 2 - tests/kernels/core/test_pos_encoding.py | 12 +- vllm/config/utils.py | 18 +- .../layers/rotary_embedding/__init__.py | 370 +++++++++--------- vllm/model_executor/models/afmoe.py | 1 - vllm/model_executor/models/apertus.py | 1 - vllm/model_executor/models/arctic.py | 1 - vllm/model_executor/models/baichuan.py | 1 - vllm/model_executor/models/bailing_moe.py | 4 +- vllm/model_executor/models/bamba.py | 7 +- vllm/model_executor/models/chameleon.py | 1 - vllm/model_executor/models/chatglm.py | 7 +- vllm/model_executor/models/commandr.py | 1 - vllm/model_executor/models/config.py | 12 +- vllm/model_executor/models/dbrx.py | 1 - vllm/model_executor/models/deepseek_v2.py | 4 - vllm/model_executor/models/dots1.py | 1 - vllm/model_executor/models/ernie45_moe.py | 1 - vllm/model_executor/models/exaone.py | 1 - vllm/model_executor/models/exaone4.py | 1 - vllm/model_executor/models/falcon.py | 1 - vllm/model_executor/models/falcon_h1.py | 7 +- vllm/model_executor/models/gemma.py | 1 - vllm/model_executor/models/gemma2.py | 1 - vllm/model_executor/models/gemma3.py | 1 - vllm/model_executor/models/gemma3n.py | 1 - vllm/model_executor/models/glm4.py | 2 - vllm/model_executor/models/glm4_1v.py | 2 +- vllm/model_executor/models/glm4_moe.py | 1 - vllm/model_executor/models/gpt_j.py | 5 +- vllm/model_executor/models/gpt_neox.py | 1 - vllm/model_executor/models/gpt_oss.py | 1 - vllm/model_executor/models/granite.py | 1 - vllm/model_executor/models/granitemoe.py | 1 - .../model_executor/models/granitemoehybrid.py | 1 - vllm/model_executor/models/grok1.py | 1 - vllm/model_executor/models/hunyuan_v1.py | 2 - vllm/model_executor/models/internlm2.py | 1 - vllm/model_executor/models/lfm2.py | 1 - vllm/model_executor/models/lfm2_moe.py | 1 - vllm/model_executor/models/llama.py | 1 - vllm/model_executor/models/llama4.py | 1 - vllm/model_executor/models/minicpm.py | 1 - vllm/model_executor/models/minicpm3.py | 1 - vllm/model_executor/models/minimax_m2.py | 6 +- vllm/model_executor/models/minimax_text_01.py | 7 +- vllm/model_executor/models/mixtral.py | 1 - vllm/model_executor/models/mllama4.py | 2 +- vllm/model_executor/models/modernbert.py | 1 - vllm/model_executor/models/molmo.py | 1 - vllm/model_executor/models/nemotron.py | 1 - vllm/model_executor/models/nemotron_nas.py | 1 - vllm/model_executor/models/olmo.py | 1 - vllm/model_executor/models/olmo2.py | 1 - vllm/model_executor/models/olmoe.py | 1 - vllm/model_executor/models/openpangu.py | 2 - vllm/model_executor/models/orion.py | 1 - vllm/model_executor/models/ouro.py | 1 - vllm/model_executor/models/persimmon.py | 1 - vllm/model_executor/models/phi.py | 12 +- vllm/model_executor/models/phimoe.py | 1 - vllm/model_executor/models/plamo2.py | 1 - vllm/model_executor/models/plamo3.py | 1 - vllm/model_executor/models/qwen.py | 1 - vllm/model_executor/models/qwen2.py | 1 - vllm/model_executor/models/qwen2_5_vl.py | 2 +- vllm/model_executor/models/qwen2_moe.py | 1 - vllm/model_executor/models/qwen2_vl.py | 2 +- vllm/model_executor/models/qwen3.py | 1 - vllm/model_executor/models/qwen3_moe.py | 1 - vllm/model_executor/models/qwen3_next.py | 1 - .../models/qwen3_omni_moe_thinker.py | 2 +- vllm/model_executor/models/qwen3_vl.py | 2 +- vllm/model_executor/models/seed_oss.py | 1 - vllm/model_executor/models/solar.py | 1 - vllm/model_executor/models/stablelm.py | 1 - vllm/model_executor/models/starcoder2.py | 1 - vllm/model_executor/models/step3_text.py | 1 - vllm/model_executor/models/zamba2.py | 1 - vllm/transformers_utils/config.py | 17 +- 83 files changed, 260 insertions(+), 314 deletions(-) diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index 83bd91917508f..09de5fa822f86 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -99,7 +99,6 @@ def benchmark_mrope( # the parameters to compute the q k v size based on tp_size mrope_helper_class = get_rope( head_size=head_dim, - rotary_dim=head_dim, max_position=max_position, is_neox_style=is_neox_style, rope_parameters=rope_parameters, diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 074b7a440b612..7a1bc050bb33f 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -32,8 +32,8 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device): def benchmark(batch_size, seq_len, num_heads, provider): dtype = torch.bfloat16 max_position = 8192 - base = 10000 - rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style) + rope_parameters = {"partial_rotary_factor": rotary_dim / head_size} + rope = get_rope(head_size, max_position, is_neox_style, rope_parameters) rope = rope.to(dtype=dtype, device=device) cos_sin_cache = rope.cos_sin_cache.to(dtype=torch.float, device=device) diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index 7585915892700..ad5ead36e2310 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -128,14 +128,12 @@ class TestFusedAddRMSNorm(torch.nn.Module): class TestRotaryEmbedding(torch.nn.Module): - def __init__(self, head_dim=64, rotary_dim=None, max_position=2048, base=10000): + def __init__(self, head_dim=64, max_position=2048, base=10000): super().__init__() self.head_dim = head_dim - self.rotary_dim = rotary_dim or head_dim self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.rotary_dim, max_position=max_position, rope_parameters={"rope_type": "default", "rope_theta": base}, ) @@ -170,7 +168,6 @@ class TestRotaryEmbeddingSliceScatter(torch.nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters={"rope_type": "default", "rope_theta": base}, ) diff --git a/tests/kernels/core/test_mrope.py b/tests/kernels/core/test_mrope.py index 4e1559a049bf9..ba5d593b2d355 100644 --- a/tests/kernels/core/test_mrope.py +++ b/tests/kernels/core/test_mrope.py @@ -116,7 +116,6 @@ def test_mrope( mrope_helper_class = get_rope( head_size=head_dim, - rotary_dim=head_dim, max_position=max_position, is_neox_style=is_neox_style, rope_parameters=config.rope_parameters, @@ -185,7 +184,6 @@ def test_mrope_torch_compile_tracing( mrope_helper_class = get_rope( head_size=head_dim, - rotary_dim=head_dim, max_position=max_position, is_neox_style=is_neox_style, rope_parameters=config.rope_parameters, diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index a8ed3825689d3..d18f01314c8f5 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -83,8 +83,12 @@ def test_rotary_embedding( torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size - rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} - rope = get_rope(head_size, rotary_dim, max_position, is_neox_style, rope_parameters) + rope_parameters = { + "rope_type": "default", + "rope_theta": rope_theta, + "partial_rotary_factor": rotary_dim / head_size, + } + rope = get_rope(head_size, max_position, is_neox_style, rope_parameters) rope = rope.to(dtype=dtype, device=torch.get_default_device()) positions = torch.randint(0, max_position, (batch_size, seq_len)) @@ -150,9 +154,9 @@ def test_rope_module_cache(): if rotary_dim is None: rotary_dim = head_size rope_parameters["rope_theta"] = rope_theta + rope_parameters["partial_rotary_factor"] = rotary_dim / head_size rope = get_rope( head_size, - rotary_dim, max_position, is_neox_style, rope_parameters, @@ -177,9 +181,9 @@ def test_rope_module_cache(): if rotary_dim is None: rotary_dim = head_size rope_parameters["rope_theta"] = rope_theta + rope_parameters["partial_rotary_factor"] = rotary_dim / head_size rope = get_rope( head_size, - rotary_dim, max_position, is_neox_style, rope_parameters, diff --git a/vllm/config/utils.py b/vllm/config/utils.py index 93da3fd417ace..470296517deb1 100644 --- a/vllm/config/utils.py +++ b/vllm/config/utils.py @@ -73,14 +73,28 @@ def get_field(cls: ConfigType, name: str) -> Field: ) -def getattr_iter(object: object, names: Iterable[str], default: Any) -> Any: +def getattr_iter( + object: object, names: Iterable[str], default: Any, warn: bool = False +) -> Any: """ A helper function that retrieves an attribute from an object which may have multiple possible names. This is useful when fetching attributes from arbitrary `transformers.PretrainedConfig` instances. + + In the case where the first name in `names` is the preferred name, and + any other names are deprecated aliases, setting `warn=True` will log a + warning when a deprecated name is used. """ - for name in names: + for i, name in enumerate(names): if hasattr(object, name): + if warn and i > 0: + logger.warning_once( + "%s contains a deprecated attribute name '%s'. " + "Please use the preferred attribute name '%s' instead.", + type(object).__name__, + name, + names[0], + ) return getattr(object, name) return default diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py index 4dff984f92be6..452b87ea4e7a5 100644 --- a/vllm/model_executor/layers/rotary_embedding/__init__.py +++ b/vllm/model_executor/layers/rotary_embedding/__init__.py @@ -25,7 +25,6 @@ _ROPE_DICT: dict[tuple, RotaryEmbedding] = {} def get_rope( head_size: int, - rotary_dim: int, max_position: int, is_neox_style: bool = True, rope_parameters: dict[str, Any] | None = None, @@ -54,12 +53,15 @@ def get_rope( else: dual_chunk_attention_args = None - partial_rotary_factor = 1.0 - if rope_parameters is not None: - partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0) + rope_parameters = rope_parameters or {} + base = rope_parameters.get("rope_theta", 10000) + scaling_type = rope_parameters.get("rope_type", "default") + partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0) + + if partial_rotary_factor <= 0.0 or partial_rotary_factor > 1.0: + raise ValueError(f"{partial_rotary_factor=} must be between 0.0 and 1.0") + rotary_dim = int(head_size * partial_rotary_factor) - if partial_rotary_factor < 1.0: - rotary_dim = int(rotary_dim * partial_rotary_factor) key = ( head_size, rotary_dim, @@ -72,7 +74,6 @@ def get_rope( if key in _ROPE_DICT: return _ROPE_DICT[key] - base = rope_parameters["rope_theta"] if rope_parameters else 10000 if dual_chunk_attention_config is not None: extra_kwargs = { k: v @@ -88,109 +89,76 @@ def get_rope( dtype, **extra_kwargs, ) - elif not rope_parameters: - rotary_emb = RotaryEmbedding( + elif scaling_type == "default": + if "mrope_section" in rope_parameters: + rotary_emb = MRotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + dtype, + mrope_section=rope_parameters["mrope_section"], + mrope_interleaved=rope_parameters.get("mrope_interleaved", False), + ) + else: + rotary_emb = RotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + dtype, + ) + elif scaling_type == "llama3": + scaling_factor = rope_parameters["factor"] + low_freq_factor = rope_parameters["low_freq_factor"] + high_freq_factor = rope_parameters["high_freq_factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] + rotary_emb = Llama3RotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + dtype, + scaling_factor, + low_freq_factor, + high_freq_factor, + original_max_position, + ) + elif scaling_type == "mllama4": + rotary_emb = Llama4VisionRotaryEmbedding( head_size, rotary_dim, max_position, base, is_neox_style, dtype ) - else: - scaling_type = rope_parameters["rope_type"] - - if scaling_type == "llama3": - scaling_factor = rope_parameters["factor"] - low_freq_factor = rope_parameters["low_freq_factor"] - high_freq_factor = rope_parameters["high_freq_factor"] - original_max_position = rope_parameters["original_max_position_embeddings"] - rotary_emb = Llama3RotaryEmbedding( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - dtype, - scaling_factor, - low_freq_factor, - high_freq_factor, - original_max_position, - ) - elif scaling_type == "mllama4": - rotary_emb = Llama4VisionRotaryEmbedding( - head_size, rotary_dim, max_position, base, is_neox_style, dtype - ) - elif scaling_type == "default": - if "mrope_section" in rope_parameters: - rotary_emb = MRotaryEmbedding( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - dtype, - mrope_section=rope_parameters["mrope_section"], - mrope_interleaved=rope_parameters.get("mrope_interleaved", False), - ) - else: - rotary_emb = RotaryEmbedding( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - dtype, - ) - elif scaling_type == "linear": - scaling_factor = rope_parameters["factor"] - rotary_emb = LinearScalingRotaryEmbedding( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - scaling_factor, - dtype, - ) - elif scaling_type == "ntk": - scaling_factor = rope_parameters["factor"] - mixed_b = rope_parameters.get("mixed_b") - rotary_emb = NTKScalingRotaryEmbedding( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - scaling_factor, - dtype, - mixed_b, - ) - elif scaling_type == "dynamic": - if "alpha" in rope_parameters: - scaling_alpha = rope_parameters["alpha"] - rotary_emb = DynamicNTKAlphaRotaryEmbedding( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - scaling_alpha, - dtype, - ) - elif "factor" in rope_parameters: - scaling_factor = rope_parameters["factor"] - rotary_emb = DynamicNTKScalingRotaryEmbedding( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - scaling_factor, - dtype, - ) - else: - raise ValueError( - "Dynamic rope scaling must contain either 'alpha' or 'factor' field" - ) - elif scaling_type == "xdrope": + elif scaling_type == "linear": + scaling_factor = rope_parameters["factor"] + rotary_emb = LinearScalingRotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + scaling_factor, + dtype, + ) + elif scaling_type == "ntk": + scaling_factor = rope_parameters["factor"] + mixed_b = rope_parameters.get("mixed_b") + rotary_emb = NTKScalingRotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + scaling_factor, + dtype, + mixed_b, + ) + elif scaling_type == "dynamic": + if "alpha" in rope_parameters: scaling_alpha = rope_parameters["alpha"] - rotary_emb = XDRotaryEmbedding( + rotary_emb = DynamicNTKAlphaRotaryEmbedding( head_size, rotary_dim, max_position, @@ -198,67 +166,66 @@ def get_rope( is_neox_style, scaling_alpha, dtype, - xdrope_section=rope_parameters["xdrope_section"], ) - elif scaling_type == "yarn": + elif "factor" in rope_parameters: scaling_factor = rope_parameters["factor"] - original_max_position = rope_parameters["original_max_position_embeddings"] - extra_kwargs = { - k: v - for k, v in rope_parameters.items() - if k - in ( - "extrapolation_factor", - "attn_factor", - "beta_fast", - "beta_slow", - "apply_yarn_scaling", - "truncate", - ) - } - if "mrope_section" in rope_parameters: - extra_kwargs.pop("apply_yarn_scaling", None) - rotary_emb = MRotaryEmbedding( - head_size, - rotary_dim, - original_max_position, - base, - is_neox_style, - dtype, - mrope_section=rope_parameters["mrope_section"], - mrope_interleaved=rope_parameters.get("mrope_interleaved", False), - scaling_factor=scaling_factor, - **extra_kwargs, - ) - else: - rotary_emb = YaRNScalingRotaryEmbedding( - head_size, - rotary_dim, - original_max_position, - base, - is_neox_style, - scaling_factor, - dtype, - **extra_kwargs, - ) - elif scaling_type in ["deepseek_yarn", "deepseek_llama_scaling"]: - scaling_factor = rope_parameters["factor"] - original_max_position = rope_parameters["original_max_position_embeddings"] - # assert max_position == original_max_position * scaling_factor - extra_kwargs = { - k: v - for k, v in rope_parameters.items() - if k - in ( - "extrapolation_factor", - "attn_factor", - "beta_fast", - "beta_slow", - "mscale", - "mscale_all_dim", - ) - } - rotary_emb = DeepseekScalingRotaryEmbedding( + rotary_emb = DynamicNTKScalingRotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + scaling_factor, + dtype, + ) + else: + raise ValueError( + "Dynamic rope scaling must contain either 'alpha' or 'factor' field" + ) + elif scaling_type == "xdrope": + scaling_alpha = rope_parameters["alpha"] + rotary_emb = XDRotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + scaling_alpha, + dtype, + xdrope_section=rope_parameters["xdrope_section"], + ) + elif scaling_type == "yarn": + scaling_factor = rope_parameters["factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] + extra_kwargs = { + k: v + for k, v in rope_parameters.items() + if k + in ( + "extrapolation_factor", + "attn_factor", + "beta_fast", + "beta_slow", + "apply_yarn_scaling", + "truncate", + ) + } + if "mrope_section" in rope_parameters: + extra_kwargs.pop("apply_yarn_scaling", None) + rotary_emb = MRotaryEmbedding( + head_size, + rotary_dim, + original_max_position, + base, + is_neox_style, + dtype, + mrope_section=rope_parameters["mrope_section"], + mrope_interleaved=rope_parameters.get("mrope_interleaved", False), + scaling_factor=scaling_factor, + **extra_kwargs, + ) + else: + rotary_emb = YaRNScalingRotaryEmbedding( head_size, rotary_dim, original_max_position, @@ -268,28 +235,55 @@ def get_rope( dtype, **extra_kwargs, ) - elif scaling_type == "longrope": - short_factor = rope_parameters["short_factor"] - long_factor = rope_parameters["long_factor"] - original_max_position = rope_parameters["original_max_position_embeddings"] - extra_kwargs = { - k: v - for k, v in rope_parameters.items() - if k in ("short_mscale", "long_mscale") - } - rotary_emb = Phi3LongRoPEScaledRotaryEmbedding( - head_size, - rotary_dim, - max_position, - original_max_position, - base, - is_neox_style, - dtype, - short_factor, - long_factor, - **extra_kwargs, + elif scaling_type in ["deepseek_yarn", "deepseek_llama_scaling"]: + scaling_factor = rope_parameters["factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] + # assert max_position == original_max_position * scaling_factor + extra_kwargs = { + k: v + for k, v in rope_parameters.items() + if k + in ( + "extrapolation_factor", + "attn_factor", + "beta_fast", + "beta_slow", + "mscale", + "mscale_all_dim", ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + } + rotary_emb = DeepseekScalingRotaryEmbedding( + head_size, + rotary_dim, + original_max_position, + base, + is_neox_style, + scaling_factor, + dtype, + **extra_kwargs, + ) + elif scaling_type == "longrope": + short_factor = rope_parameters["short_factor"] + long_factor = rope_parameters["long_factor"] + original_max_position = rope_parameters["original_max_position_embeddings"] + extra_kwargs = { + k: v + for k, v in rope_parameters.items() + if k in ("short_mscale", "long_mscale") + } + rotary_emb = Phi3LongRoPEScaledRotaryEmbedding( + head_size, + rotary_dim, + max_position, + original_max_position, + base, + is_neox_style, + dtype, + short_factor, + long_factor, + **extra_kwargs, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") _ROPE_DICT[key] = rotary_emb return rotary_emb diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py index 85827d54c911a..3ced52c2050d6 100644 --- a/vllm/model_executor/models/afmoe.py +++ b/vllm/model_executor/models/afmoe.py @@ -241,7 +241,6 @@ class AfmoeAttention(nn.Module): if self.is_local_attention: self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config["rope_parameters"], is_neox_style=True, diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index 2a8be29d8d306..e3f97a718b0f4 100644 --- a/vllm/model_executor/models/apertus.py +++ b/vllm/model_executor/models/apertus.py @@ -226,7 +226,6 @@ class ApertusAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 266d29a8d9b2b..0200984c0ec85 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -314,7 +314,6 @@ class ArcticAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index beb22995a0719..ee4a1dbd6df94 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -189,7 +189,6 @@ class BaiChuanAttention(nn.Module): else: self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index 0143e140af265..4bccee7521749 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -127,11 +127,11 @@ class BailingAttention(nn.Module): prefix=f"{prefix}.dense", ) - self.rotary_dim = getattr(config, "rotary_dim", self.head_dim) + rotary_dim = getattr(config, "rotary_dim", self.head_dim) + config.rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.rotary_dim, max_position=config.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index 00d742f84ef79..22631bbc5489b 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -178,14 +178,11 @@ class BambaAttentionDecoderLayer(nn.Module): self.scaling = self.head_dim**-0.5 self.max_position_embeddings = max_position_embeddings - if hasattr(config, "attn_rotary_emb"): - rotary_dim = config.attn_rotary_emb # for backward compatibility - else: - rotary_dim = self.head_dim # default + rotary_dim = getattr(config, "attn_rotary_emb", self.head_dim) + config.rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim self.rotary_emb = get_rope( head_size=self.head_dim, - rotary_dim=rotary_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index dfc05a366b286..176c5cd14c6e2 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -314,7 +314,6 @@ class ChameleonAttention(nn.Module): self.k_norm = ChameleonLayerNorm((self.num_kv_heads, self.head_dim)) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 3d485fdd0a2e1..26181d1c9bae4 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -99,13 +99,16 @@ class GLMAttention(nn.Module): # https://huggingface.co/zai-org/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141 rope_ratio = getattr(config, "rope_ratio", 1.0) max_positions = getattr(config, "seq_length", 8192) - rope_parameters = {"rope_type": "default", "rope_theta": 10000 * rope_ratio} + rope_parameters = { + "rope_type": "default", + "rope_theta": 10000 * rope_ratio, + "partial_rotary_factor": 0.5, + } # NOTE: zai-org/cogagent-9b-20241220 uses original_rope=False, # which is equivalent to is_neox_style=True is_neox_style = not config.original_rope self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim // 2, max_position=max_positions, rope_parameters=rope_parameters, is_neox_style=is_neox_style, diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index f837502c468f1..63a93eaa2d4f3 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -175,7 +175,6 @@ class CohereAttention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=False, diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 8de793941b8c3..06cc92ee88180 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -42,9 +42,10 @@ class GteNewModelConfig(VerifyAndUpdateConfig): config.hidden_act = "geglu" head_dim = config.hidden_size // config.num_attention_heads + rotary_dim = getattr(config, "rotary_emb_dim", head_dim) + config.rope_parameters["partial_rotary_factor"] = rotary_dim / head_dim config.rotary_kwargs = { "head_size": head_dim, - "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": config.max_position_embeddings, "rope_parameters": config.rope_parameters, } @@ -77,9 +78,11 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig): if not model_config.enforce_eager: max_position = round_up(max_position, 8) + rotary_dim = getattr(config, "rotary_emb_dim", head_dim) + config.rope_parameters["partial_rotary_factor"] = rotary_dim / head_dim + config.rotary_kwargs = { "head_size": head_dim, - "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": max_position, "rope_parameters": config.rope_parameters, } @@ -113,12 +116,10 @@ class NomicBertModelConfig(VerifyAndUpdateConfig): config.num_hidden_layers = config.n_layer head_dim = config.hidden_size // config.num_attention_heads - rotary_emb_dim = int(head_dim * config.rotary_emb_fraction) max_trained_positions = getattr(config, "max_trained_positions", 2048) config.rotary_kwargs = { "head_size": head_dim, - "rotary_dim": rotary_emb_dim, "max_position": max_trained_positions, "rope_parameters": config.rope_parameters, } @@ -240,9 +241,10 @@ class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig): config.hidden_act = "geglu" head_dim = config.hidden_size // config.num_attention_heads + rotary_dim = getattr(config, "rotary_emb_dim", head_dim) + config.rope_parameters["partial_rotary_factor"] = rotary_dim / head_dim config.rotary_kwargs = { "head_size": head_dim, - "rotary_dim": getattr(config, "rotary_emb_dim", head_dim), "max_position": config.max_position_embeddings, "rope_parameters": config.rope_parameters, } diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 946baffc8817a..db4fe61b0d85f 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -222,7 +222,6 @@ class DbrxAttention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position, rope_parameters=rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 0b6513789aea8..a9fa76deecbd2 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -156,7 +156,6 @@ class DeepseekAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) @@ -499,7 +498,6 @@ class DeepseekV2Attention(nn.Module): self.rotary_emb = get_rope( qk_rope_head_dim, - rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=False, @@ -1018,7 +1016,6 @@ class DeepseekV2MLAAttention(nn.Module): self.rotary_emb = get_rope( qk_rope_head_dim, - rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=False, @@ -1038,7 +1035,6 @@ class DeepseekV2MLAAttention(nn.Module): if self.is_v32: self.indexer_rope_emb = get_rope( qk_rope_head_dim, - rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/dots1.py b/vllm/model_executor/models/dots1.py index 3beee9f864634..870a37039f151 100644 --- a/vllm/model_executor/models/dots1.py +++ b/vllm/model_executor/models/dots1.py @@ -250,7 +250,6 @@ class Dots1Attention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index 278ba45e9684c..fbbd31a485383 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -288,7 +288,6 @@ class Ernie4_5_MoeAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, is_neox_style=False, diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index acf651ed24988..039e7cf68e52b 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -167,7 +167,6 @@ class ExaoneAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index cb710a7ec5cf9..b4b7a798fd050 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -176,7 +176,6 @@ class Exaone4Attention(nn.Module): set_default_rope_theta(config, default_theta=1000000) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 32d9e7b925597..7cdfcae0e718d 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -167,7 +167,6 @@ class FalconAttention(nn.Module): max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index a1c1263f8d724..bfb6b1a1f160d 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -242,14 +242,11 @@ class FalconH1AttentionDecoderLayer(nn.Module): self.scaling = self.head_dim**-0.5 self.max_position_embeddings = max_position_embeddings - if hasattr(config, "attn_rotary_emb"): - rotary_dim = config.attn_rotary_emb # for backward compatibility - else: - rotary_dim = self.head_dim # default + rotary_dim = getattr(config, "attn_rotary_emb", self.head_dim) + config.rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim self.rotary_emb = get_rope( head_size=self.head_dim, - rotary_dim=rotary_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index dd5a74c8ed005..7304a728067f4 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -174,7 +174,6 @@ class GemmaAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index cb36e04824588..fe6ec5ff83dec 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -152,7 +152,6 @@ class Gemma2Attention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 73176eba95ed5..40f6d100c767e 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -176,7 +176,6 @@ class Gemma3Attention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index f4427c9fd1d10..4d446f51c2ecb 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -384,7 +384,6 @@ class Gemma3nAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index 9adfa942b99fa..2cd11e66c752b 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -81,7 +81,6 @@ class Glm4Attention(nn.Module): config.rope_parameters.setdefault("partial_rotary_factor", 0.5) self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) self.head_dim = head_dim or hidden_size // self.total_num_heads - self.rotary_dim = self.head_dim self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 @@ -103,7 +102,6 @@ class Glm4Attention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.rotary_dim, max_position=max_position, rope_parameters=config.rope_parameters, is_neox_style=False, diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index de091f03e881c..786482d77a1d2 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -678,9 +678,9 @@ class Glm4vVisionTransformer(nn.Module): head_dim = self.hidden_size // self.num_heads self.rotary_pos_emb = get_rope( head_size=head_dim, - rotary_dim=head_dim // 2, max_position=8192, is_neox_style=True, + rope_parameters={"partial_rotary_factor": 0.5}, ) self.blocks = nn.ModuleList( [ diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py index 8cae5ee425e4d..541d3b2beff83 100644 --- a/vllm/model_executor/models/glm4_moe.py +++ b/vllm/model_executor/models/glm4_moe.py @@ -285,7 +285,6 @@ class Glm4MoeAttention(nn.Module): config.rope_parameters.setdefault("partial_rotary_factor", 0.5) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index f0a34c47da54c..f32ac2639435c 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -95,12 +95,13 @@ class GPTJAttention(nn.Module): scaling = self.head_size**-0.5 assert getattr(config, "rotary", True) assert config.rotary_dim % 2 == 0 + rope_parameters = getattr(config, "rope_parameters", {}) + rope_parameters["partial_rotary_factor"] = config.rotary_dim / self.head_size max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_size, - rotary_dim=config.rotary_dim, max_position=max_position_embeddings, - rope_parameters=getattr(config, "rope_parameters", None), + rope_parameters=rope_parameters, is_neox_style=False, ) self.attn = Attention( diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 212d605c17285..c4d11b488f38b 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -92,7 +92,6 @@ class GPTNeoXAttention(nn.Module): max_position_embeddings = getattr(config, "max_position_embeddings", 8192) self.rotary_emb = get_rope( self.head_size, - rotary_dim=self.head_size, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index cff16b7a7a8cd..6a92cf1533213 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -67,7 +67,6 @@ class OAIAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=config.max_position_embeddings, dtype=torch.float32, rope_parameters={ diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 76519c4660f15..82c945f5ad5ec 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -160,7 +160,6 @@ class GraniteAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index b038400a1262a..0b1064b6343e3 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -190,7 +190,6 @@ class GraniteMoeAttention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters=rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 1d9c2f5df4a55..3434716b83789 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -271,7 +271,6 @@ class GraniteMoeHybridAttention(nn.Module): if config.position_embedding_type == "rope": self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=config.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 6f62a1d11e52e..0a2e5cf39ffd8 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -181,7 +181,6 @@ class Grok1Attention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters=rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index ccdfa3fe175f1..0e82e84c4edbe 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -199,7 +199,6 @@ class HunYuanAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, @@ -305,7 +304,6 @@ class HunYuanCrossAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index c79934e121447..3ca8864618628 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -140,7 +140,6 @@ class InternLM2Attention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/lfm2.py b/vllm/model_executor/models/lfm2.py index a4a994f97a2f8..142ad3d6d1d1a 100644 --- a/vllm/model_executor/models/lfm2.py +++ b/vllm/model_executor/models/lfm2.py @@ -143,7 +143,6 @@ class Lfm2Attention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/lfm2_moe.py b/vllm/model_executor/models/lfm2_moe.py index c8669de72dd09..70804e0a843e8 100644 --- a/vllm/model_executor/models/lfm2_moe.py +++ b/vllm/model_executor/models/lfm2_moe.py @@ -236,7 +236,6 @@ class Lfm2MoeAttention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 167dfbca248ce..3507a2bc66c17 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -259,7 +259,6 @@ class LlamaAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=getattr(config, "rope_parameters", None), is_neox_style=is_neox_style, diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 423be45e80149..7b3da3e10ab8a 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -243,7 +243,6 @@ class Llama4Attention(nn.Module): self.rotary_emb = ( get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 67c462f4b25c4..f104018d3aa6c 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -277,7 +277,6 @@ class MiniCPMAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index 0a2bcbd7f6084..c7a54cea21544 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -120,7 +120,6 @@ class MiniCPM3Attention(nn.Module): self.rotary_emb = get_rope( self.qk_rope_head_dim, - rotary_dim=self.qk_rope_head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py index 3e6a9add9ec49..ee19288ae6852 100644 --- a/vllm/model_executor/models/minimax_m2.py +++ b/vllm/model_executor/models/minimax_m2.py @@ -199,9 +199,13 @@ class MiniMaxM2Attention(nn.Module): prefix=f"{prefix}.o_proj", ) + if ( + rope_parameters is not None + and "partial_rotary_factor" not in rope_parameters + ): + rope_parameters["partial_rotary_factor"] = rotary_dim / self.head_dim self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 390de78cc27b4..4bfe3c391c26f 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -187,7 +187,6 @@ class MiniMaxText01Attention(nn.Module): num_heads: int, head_dim: int, num_kv_heads: int, - rotary_dim: int, max_position: int = 4096 * 32, rope_parameters: dict | None = None, sliding_window: int | None = None, @@ -245,7 +244,6 @@ class MiniMaxText01Attention(nn.Module): ) self.rotary_emb = get_rope( head_size=self.head_dim, - rotary_dim=rotary_dim, max_position=max_position, rope_parameters=rope_parameters, is_neox_style=True, @@ -290,6 +288,8 @@ class MiniMaxText01DecoderLayer(nn.Module): head_dim = getattr(config, "head_dim", None) if head_dim is None: head_dim = config.hidden_size // config.num_attention_heads + rotary_dim = getattr(config, "rotary_dim", head_dim) + config.rope_parameters["partial_rotary_factor"] = rotary_dim / head_dim if hasattr(config, "max_model_len") and isinstance(config.max_model_len, int): max_position_embeddings = min( config.max_position_embeddings, config.max_model_len @@ -321,9 +321,6 @@ class MiniMaxText01DecoderLayer(nn.Module): hidden_size=self.hidden_size, num_heads=config.num_attention_heads, head_dim=head_dim, - rotary_dim=config.rotary_dim - if hasattr(config, "rotary_dim") - else head_dim, num_kv_heads=config.num_key_value_heads, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 50ec57e7a8053..e170c530ca29f 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -206,7 +206,6 @@ class MixtralAttention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index e944c0ee38aa1..fe963cc6644fb 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -295,11 +295,11 @@ class Llama4VisionAttention(nn.Module): rope_parameters = { "rope_type": "mllama4", "rope_theta": config.rope_parameters["rope_theta"], + "partial_rotary_factor": 0.5, } self.rotary_emb = get_rope( head_size=self.head_dim, - rotary_dim=config.hidden_size // config.num_attention_heads // 2, # number of image patches max_position=(config.image_size // config.patch_size) ** 2, rope_parameters=rope_parameters, diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index be36f761c63aa..4655ffa7b2f61 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -105,7 +105,6 @@ class ModernBertAttention(nn.Module): self.rotary_emb = get_rope( head_size=self.head_dim, - rotary_dim=self.head_dim, max_position=config.max_position_embeddings, rope_parameters=rope_parameters, dtype=torch.float16, diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index a6cd9ad16c188..71c6b1aa2e814 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -433,7 +433,6 @@ class MolmoAttention(nn.Module): # Rotary embeddings. self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index bf83ee5e42a15..21605015c470b 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -199,7 +199,6 @@ class NemotronAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index 734fbc60709fa..19a942a5277cc 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -118,7 +118,6 @@ class DeciLMAttention(LlamaAttention): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 3bbb4dd242262..dd7c27f10c531 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -102,7 +102,6 @@ class OlmoAttention(nn.Module): # Rotary embeddings. self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 88e9c2d8541a1..b030c94b54cd5 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -146,7 +146,6 @@ class Olmo2Attention(nn.Module): rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 1376583a99725..a5a926151c5c9 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -171,7 +171,6 @@ class OlmoeAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/openpangu.py b/vllm/model_executor/models/openpangu.py index bddd9fa50957a..47abd7bf0b68a 100644 --- a/vllm/model_executor/models/openpangu.py +++ b/vllm/model_executor/models/openpangu.py @@ -352,7 +352,6 @@ class OpenPanguMLAAttention(nn.Module): } self.rotary_emb = get_rope( qk_rope_head_dim, - rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, is_neox_style=False, @@ -525,7 +524,6 @@ class OpenPanguEmbeddedAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=is_neox_style, diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 544a44ed54681..9d9066c4ba619 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -135,7 +135,6 @@ class OrionAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py index dcae92ed20881..829148b4c1fb7 100644 --- a/vllm/model_executor/models/ouro.py +++ b/vllm/model_executor/models/ouro.py @@ -166,7 +166,6 @@ class OuroAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters=config.rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 8f26c68720a5c..b644603c5baa1 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -134,7 +134,6 @@ class PersimmonAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 253fbbc41330c..e01e9d47c545c 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -84,19 +84,18 @@ class PhiAttention(nn.Module): prefix: str = "", ): super().__init__() - self.total_num_heads = config.num_attention_heads self.hidden_size = config.hidden_size - self.head_size = self.hidden_size // self.total_num_heads + self.head_size = self.hidden_size // config.num_attention_heads tensor_model_parallel_world_size = get_tensor_model_parallel_world_size() - assert self.total_num_heads % tensor_model_parallel_world_size == 0 - self.num_heads = self.total_num_heads // tensor_model_parallel_world_size + assert config.num_attention_heads % tensor_model_parallel_world_size == 0 + self.num_heads = config.num_attention_heads // tensor_model_parallel_world_size # pylint: disable=C0103 self.qkv_proj = QKVParallelLinear( self.hidden_size, self.head_size, - self.total_num_heads, + config.num_attention_heads, bias=True, quant_config=quant_config, prefix=f"{prefix}.qkv_proj", @@ -109,13 +108,10 @@ class PhiAttention(nn.Module): ) scaling = self.head_size**-0.5 - rotary_dim = config.hidden_size // config.num_attention_heads - assert rotary_dim % 2 == 0 max_position_embeddings = getattr(config, "max_position_embeddings", 2048) self.rotary_emb = get_rope( self.head_size, - rotary_dim=rotary_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 49530776f8903..14f73d0c64586 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -352,7 +352,6 @@ class PhiMoEAttention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters=rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 472de5590dcf8..6765ee0c5779c 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -574,7 +574,6 @@ class Plamo2AttentionMixer(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/plamo3.py b/vllm/model_executor/models/plamo3.py index 4aeb9d432dcc6..3557104d905cb 100644 --- a/vllm/model_executor/models/plamo3.py +++ b/vllm/model_executor/models/plamo3.py @@ -179,7 +179,6 @@ class Plamo3AttentionMixer(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 12285cf9c1968..492ba2fb12145 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -114,7 +114,6 @@ class QWenAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index f5501bae78418..3af4a49cd77cc 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -164,7 +164,6 @@ class Qwen2Attention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 3cc3a3a7873c6..fba06e34f6227 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -624,9 +624,9 @@ class Qwen2_5_VisionTransformer(nn.Module): head_dim = self.hidden_size // self.num_heads self.rotary_pos_emb = get_rope( head_size=head_dim, - rotary_dim=head_dim // 2, max_position=8192, is_neox_style=True, + rope_parameters={"partial_rotary_factor": 0.5}, ) self.attn_backend = get_vit_attn_backend( diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index cbc618f1abd08..2750f1864b81a 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -244,7 +244,6 @@ class Qwen2MoeAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 608e90337f452..2c4ac2f8efff1 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -621,9 +621,9 @@ class Qwen2VisionTransformer(nn.Module): head_dim = embed_dim // num_heads self.rotary_pos_emb = get_rope( head_size=head_dim, - rotary_dim=head_dim // 2, max_position=8192, is_neox_style=True, + rope_parameters={"partial_rotary_factor": 0.5}, ) self.blocks = nn.ModuleList( diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 7d2b3e5f9bc79..0d0da52ed7382 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -111,7 +111,6 @@ class Qwen3Attention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index c6984dc37c51c..0be81ecc7dd3a 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -269,7 +269,6 @@ class Qwen3MoeAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, dual_chunk_attention_config=dual_chunk_attention_config, diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index dd64e3983e381..6a5447ad0fed4 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -747,7 +747,6 @@ class Qwen3NextAttention(nn.Module): self.rotary_emb = get_rope( head_size=self.head_dim, - rotary_dim=self.head_dim, max_position=config.max_position_embeddings, rope_parameters=config.rope_parameters, dual_chunk_attention_config=self.dual_chunk_attention_config, diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index dbe7bcd07576b..635c3bfdc65c7 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -333,9 +333,9 @@ class Qwen3Omni_VisionTransformer(nn.Module): head_dim = self.hidden_size // self.num_heads self.rotary_pos_emb = get_rope( head_size=head_dim, - rotary_dim=head_dim // 2, max_position=8192, is_neox_style=True, + rope_parameters={"partial_rotary_factor": 0.5}, ) self.blocks = nn.ModuleList( diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index f8e0ea6284994..fcd58c4d33cd7 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -340,9 +340,9 @@ class Qwen3_VisionTransformer(nn.Module): head_dim = self.hidden_size // self.num_heads self.rotary_pos_emb = get_rope( head_size=head_dim, - rotary_dim=head_dim // 2, max_position=8192, is_neox_style=True, + rope_parameters={"partial_rotary_factor": 0.5}, ) self.merger = Qwen3_VisionPatchMerger( diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index 267c60157506d..f25223c782552 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -161,7 +161,6 @@ class SeedOssAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 7bef56110cab7..964aa902704b3 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -160,7 +160,6 @@ class SolarAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, ) diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index e879599ad3ead..ea4342882feb4 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -148,7 +148,6 @@ class StablelmAttention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.config.max_position_embeddings, rope_parameters=self.config.rope_parameters, ) diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 46422f303ff43..569ca9b082cfa 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -112,7 +112,6 @@ class Starcoder2Attention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=self.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/model_executor/models/step3_text.py b/vllm/model_executor/models/step3_text.py index 077cce84a98dd..7077f1a22e8d7 100644 --- a/vllm/model_executor/models/step3_text.py +++ b/vllm/model_executor/models/step3_text.py @@ -196,7 +196,6 @@ class Step3TextAttention(nn.Module): ) self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, max_position=max_position_embedding, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index 653b5b9beef7b..fe157887eea91 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -230,7 +230,6 @@ class Zamba2Attention(nn.Module): if config.use_mem_rope: self.rotary_emb = get_rope( head_size=self.attention_head_dim, - rotary_dim=self.attention_head_dim, max_position=config.max_position_embeddings, rope_parameters=config.rope_parameters, is_neox_style=True, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index d761802da9403..fb88c62dc5b23 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -306,8 +306,13 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: """Provide backwards compatibility for RoPE.""" from vllm.config.utils import getattr_iter - rope_theta_names = ("rope_theta", "rotary_emb_base") - rope_theta = getattr_iter(config, rope_theta_names, None) + # Older custom models may use non-standard field names + # which need patching for both Transformers v4 and v5. + names = ["rope_theta", "rotary_emb_base"] + rope_theta = getattr_iter(config, names, None, warn=True) + names = ["partial_rotary_factor", "rotary_pct", "rotary_emb_fraction"] + partial_rotary_factor = getattr_iter(config, names, None, warn=True) + if Version(version("transformers")) < Version("5.0.0.dev0"): # Transformers v4 installed, legacy config fields may be present if (rope_scaling := getattr(config, "rope_scaling", None)) is not None: @@ -316,14 +321,18 @@ def patch_rope_parameters(config: PretrainedConfig) -> None: if not hasattr(config, "rope_parameters"): config.rope_parameters = {"rope_type": "default"} config.rope_parameters["rope_theta"] = rope_theta - partial_rotary_factor_names = ("partial_rotary_factor", "rotary_pct") - partial_rotary_factor = getattr_iter(config, partial_rotary_factor_names, None) if partial_rotary_factor is not None: if not hasattr(config, "rope_parameters"): config.rope_parameters = {"rope_type": "default"} config.rope_parameters["partial_rotary_factor"] = partial_rotary_factor elif rope_theta is not None or hasattr(config, "rope_parameters"): # Transformers v5 installed + # Patch these fields in case they used non-standard names + if rope_theta is not None: + config.rope_theta = rope_theta + if partial_rotary_factor is not None: + config.partial_rotary_factor = partial_rotary_factor + # Standardize and validate RoPE parameters config.standardize_rope_params() config.validate_rope() From 90d6cf921fe623524f618740616a6cf494d4a8df Mon Sep 17 00:00:00 2001 From: Xingyu Liu <38244988+charlotte12l@users.noreply.github.com> Date: Thu, 11 Dec 2025 13:00:15 -0800 Subject: [PATCH 04/57] [BugFix][MM]support VLLM_RANDOMIZE_DP_DUMMY_INPUTS (#30472) Signed-off-by: Xingyu Liu Co-authored-by: Cyrus Leung --- vllm/v1/worker/gpu_model_runner.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0e2bf9df9a18f..40c8059f90d34 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import functools import gc import itertools import time @@ -3892,19 +3893,21 @@ class GPUModelRunner( return {} @contextmanager - def maybe_randomize_inputs(self, input_ids: torch.Tensor): + def maybe_randomize_inputs( + self, input_ids: torch.Tensor | None, inputs_embeds: torch.Tensor | None + ): """ Randomize input_ids if VLLM_RANDOMIZE_DP_DUMMY_INPUTS is set. This is to help balance expert-selection - during profile_run - during DP rank dummy run """ + dp_size = self.vllm_config.parallel_config.data_parallel_size randomize_inputs = envs.VLLM_RANDOMIZE_DP_DUMMY_INPUTS and dp_size > 1 if not randomize_inputs: yield - else: - import functools + elif input_ids is not None: @functools.cache def rand_input_ids() -> torch.Tensor: @@ -3912,13 +3915,27 @@ class GPUModelRunner( self.input_ids.gpu, low=0, high=self.model_config.get_vocab_size(), - dtype=input_ids.dtype, ) - logger.debug_once("Randomizing dummy data for DP Rank") + logger.debug_once("Randomizing dummy input_ids for DP Rank") input_ids.copy_(rand_input_ids()[: input_ids.size(0)], non_blocking=True) yield input_ids.fill_(0) + else: + + @functools.cache + def rand_inputs_embeds() -> torch.Tensor: + return torch.randn_like( + self.inputs_embeds.gpu, + ) + + assert inputs_embeds is not None + logger.debug_once("Randomizing dummy inputs_embeds for DP Rank") + inputs_embeds.copy_( + rand_inputs_embeds()[: inputs_embeds.size(0)], non_blocking=True + ) + yield + inputs_embeds.fill_(0) def _get_mm_dummy_batch( self, @@ -4167,7 +4184,7 @@ class GPUModelRunner( num_tokens_across_dp[:] = num_tokens_padded with ( - self.maybe_randomize_inputs(input_ids), + self.maybe_randomize_inputs(input_ids, inputs_embeds), set_forward_context( attn_metadata, self.vllm_config, From 0efd9f867c6a7617fbcb8a335677bb8295f1bcb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Thu, 11 Dec 2025 22:06:51 +0100 Subject: [PATCH 05/57] [Core] Whisper Enable Encoder Batching (#29421) Signed-off-by: NickLucche --- vllm/config/model.py | 5 +++ vllm/config/vllm.py | 30 +++++---------- vllm/model_executor/models/whisper.py | 17 +++++++-- vllm/v1/core/encoder_cache_manager.py | 53 +++++++++++++++++++++++++++ vllm/v1/core/sched/scheduler.py | 7 +++- 5 files changed, 87 insertions(+), 25 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 03140c17fb50e..59e9689567bd2 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -539,6 +539,11 @@ class ModelConfig: self.original_max_model_len = self.max_model_len self.max_model_len = self.get_and_verify_max_len(self.max_model_len) + + if self.is_encoder_decoder: + self.mm_processor_cache_gb = 0 + logger.info("Encoder-decoder model detected, disabling mm processor cache.") + # Init multimodal config if needed if self._model_info.supports_multimodal: if ( diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 0e75daf0d722c..b5f8f916de438 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -750,27 +750,17 @@ class VllmConfig: # TODO: Move after https://github.com/vllm-project/vllm/pull/26847 lands self._set_compile_ranges() - if self.model_config and self.model_config.is_encoder_decoder: - from vllm.multimodal import MULTIMODAL_REGISTRY - - self.scheduler_config.max_num_encoder_input_tokens = ( - MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config) + if ( + self.model_config + and self.model_config.architecture == "WhisperForConditionalGeneration" + and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn" + ): + logger.warning( + "Whisper is known to have issues with " + "forked workers. If startup is hanging, " + "try setting 'VLLM_WORKER_MULTIPROC_METHOD' " + "to 'spawn'." ) - logger.debug( - "Encoder-decoder model detected: setting " - "`max_num_encoder_input_tokens` to encoder length (%s)", - self.scheduler_config.max_num_encoder_input_tokens, - ) - if ( - self.model_config.architecture == "WhisperForConditionalGeneration" - and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn" - ): - logger.warning( - "Whisper is known to have issues with " - "forked workers. If startup is hanging, " - "try setting 'VLLM_WORKER_MULTIPROC_METHOD' " - "to 'spawn'." - ) if ( self.kv_events_config is not None diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index b2feff1335151..b513e3513b2e2 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -522,6 +522,7 @@ class WhisperEncoder(nn.Module): def forward(self, input_features: torch.Tensor | list[torch.Tensor]): hidden_states = [] + input_is_batched = False for features in input_features: embeds = nn.functional.gelu(self.conv1(features)) embeds = nn.functional.gelu(self.conv2(embeds)) @@ -530,7 +531,13 @@ class WhisperEncoder(nn.Module): embeds.dtype ) hidden_states.append(embeds) - hidden_states = torch.cat(hidden_states) + input_is_batched = embeds.ndim > 2 + # Input to MHA must be B x T x D + if input_is_batched: + # Models using WhisperEncoder may handle batching internally. + hidden_states = torch.cat(hidden_states) + else: + hidden_states = torch.stack(hidden_states, dim=0) for encoder_layer in self.layers: hidden_states = encoder_layer(hidden_states) @@ -603,8 +610,7 @@ class WhisperModel(nn.Module): positions: torch.Tensor, encoder_outputs: list[torch.Tensor], ) -> torch.Tensor: - assert len(encoder_outputs) in (0, 1) - enc_states = encoder_outputs[0] if len(encoder_outputs) == 1 else None + enc_states = torch.cat(encoder_outputs, dim=0) if len(encoder_outputs) else None decoder_outputs = self.decoder( input_ids=input_ids, positions=positions, @@ -913,7 +919,10 @@ class WhisperForConditionalGeneration( def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: # Required as part of SupportsMultiModal interface. audio_input = self._parse_and_validate_audio_input(**kwargs) - return [self.model.get_encoder_outputs(audio_input["input_features"])] + # Split concatenated encoder outputs into one tensor per audio input + enc_output = self.model.get_encoder_outputs(audio_input["input_features"]) + # The assumption is we can only process whole mm items (audios) + return enc_output.unbind(dim=0) def embed_input_ids( self, diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 3959e9a59a53b..50f738713590b 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -341,3 +341,56 @@ def compute_mm_encoder_budget( ) return encoder_compute_budget, encoder_cache_size + + +# NOTE (NickLucche): Temporary implementation for encoder-decoder models that only +# use the manager for scheduling purposes. Encoder-decoder models will eventually +# utilize the cache and this class will fold into EncoderCacheManager, as +# differences with MM models shrink. +class EncoderDecoderCacheManager(EncoderCacheManager): + def __init__(self, cache_size: int): + self.cache_size = cache_size + self.num_free_slots = cache_size + self.freed: list[str] = [] + + def check_and_update_cache(self, request: Request, input_id: int) -> bool: + return False + + def can_allocate( + self, + request: Request, + input_id: int, + encoder_compute_budget: int, + num_tokens_to_schedule: int, + ) -> bool: + num_tokens = request.get_num_encoder_tokens(input_id) + # Not enough compute budget + if num_tokens > encoder_compute_budget: + return False + + num_tokens += num_tokens_to_schedule + # Enough free slots + return num_tokens <= self.num_free_slots + + def allocate(self, request: Request, input_id: int) -> None: + num_encoder_tokens = request.get_num_encoder_tokens(input_id) + self.num_free_slots -= num_encoder_tokens + + mm_hash = request.mm_features[input_id].identifier + self.freed.append(mm_hash) + + def free(self, request: Request) -> None: + for input_id in range(len(request.mm_features)): + self.free_encoder_input(request, input_id) + + def get_cached_input_ids(self, request: Request) -> set[int]: + return set(range(len(request.mm_features))) + + def get_freed_mm_hashes(self) -> list[str]: + freed = self.freed + self.freed = [] + return freed + + def free_encoder_input(self, request: Request, input_id: int) -> None: + num_tokens = request.get_num_encoder_tokens(input_id) + self.num_free_slots += num_tokens diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index c3d504f2e72c3..a9ce6e63cc775 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -27,6 +27,7 @@ from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.v1.core.encoder_cache_manager import ( EncoderCacheManager, + EncoderDecoderCacheManager, compute_encoder_budget, ) from vllm.v1.core.kv_cache_manager import KVCacheBlocks, KVCacheManager @@ -181,7 +182,11 @@ class Scheduler(SchedulerInterface): # NOTE: For the models without encoder (e.g., text-only models), # the encoder cache will not be initialized because cache size is 0 # for these models. - self.encoder_cache_manager = EncoderCacheManager(cache_size=encoder_cache_size) + self.encoder_cache_manager = ( + EncoderDecoderCacheManager(cache_size=encoder_cache_size) + if self.is_encoder_decoder + else EncoderCacheManager(cache_size=encoder_cache_size) + ) speculative_config = vllm_config.speculative_config self.use_eagle = False From 3efdc3feaef01d45fb54650163da480bdf2f0ce4 Mon Sep 17 00:00:00 2001 From: ioana ghiban Date: Thu, 11 Dec 2025 23:03:29 +0100 Subject: [PATCH 06/57] [Docs][CPU backend] Add pre-built Arm CPU Docker images (#30491) Signed-off-by: Ioana Ghiban --- .../installation/cpu.arm.inc.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/docs/getting_started/installation/cpu.arm.inc.md b/docs/getting_started/installation/cpu.arm.inc.md index 8ec18bcb826ec..ad9c7d9ef21be 100644 --- a/docs/getting_started/installation/cpu.arm.inc.md +++ b/docs/getting_started/installation/cpu.arm.inc.md @@ -100,7 +100,23 @@ Testing has been conducted on AWS Graviton3 instances for compatibility. # --8<-- [end:build-wheel-from-source] # --8<-- [start:pre-built-images] -Currently, there are no pre-built Arm CPU images. +See [Using Docker](../../deployment/docker.md) for instructions on using the official Docker image. + +Stable vLLM Docker images are being pre-built for Arm from version 0.12.0. Available image tags are here: [https://gallery.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo). +Please replace `` in the command below with a specific version string (e.g., `0.12.0`). + +```bash +docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v +``` + +You can also access the latest code with Docker images. These are not intended for production use and are meant for CI and testing only. They will expire after several days. + +The latest code can contain bugs and may not be stable. Please use it with caution. + +```bash +export VLLM_COMMIT=6299628d326f429eba78736acb44e76749b281f5 # use full commit hash from the main branch +docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT}-arm64-cpu +``` # --8<-- [end:pre-built-images] # --8<-- [start:build-image-from-source] From c817b1415121cf88178af1e4e78f651d802df4da Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 11 Dec 2025 17:28:34 -0500 Subject: [PATCH 07/57] [Perf] Optimize deepgemm experts initialization, 3.9% TTFT improvement (#30494) Signed-off-by: yewentao256 Co-authored-by: li-jinpeng <3332126450@qq.com> Co-authored-by: youkaichao --- .../layers/fused_moe/deep_gemm_utils.py | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py b/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py index 6cca954123274..57d303cd53fef 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_utils.py @@ -84,10 +84,16 @@ def _fwd_kernel_ep_scatter_1( m_indices_start_ptr = m_indices + cur_expert_start off_expert = tl.arange(0, BLOCK_E) + # any rows in the per-expert aligned region that do not correspond to + # real tokens are left untouched here and should remain initialized to + # -1 so DeepGEMM can skip them for start_m in tl.range(0, cur_expert_token_num, BLOCK_E, num_stages=4): + offs = start_m + off_expert + mask = offs < cur_expert_token_num tl.store( - m_indices_start_ptr + start_m + off_expert, + m_indices_start_ptr + offs, cur_expert, + mask=mask, ) @@ -366,12 +372,17 @@ def deepgemm_moe_permute( (M_sum, H // block_k), device=device, dtype=torch.float32 ) - maybe_has_empty_blocks = (expert_tokens_meta is None) or ( - expert_tokens_meta.expert_num_tokens_cpu is None + # DeepGEMM uses negative values in m_indices (here expert_ids) to mark + # completely invalid / padded blocks that should be skipped. We always + # initialize expert_ids to -1 so any row that is not explicitly written + # by the scatter kernel will be treated as invalid and skipped by + # DeepGEMM's scheduler. + expert_ids = torch.full( + (M_sum,), + fill_value=-1, + device=device, + dtype=torch.int32, ) - expert_ids_init = torch.zeros if maybe_has_empty_blocks else torch.empty - - expert_ids = expert_ids_init((M_sum), device=device, dtype=torch.int32) inv_perm = torch.empty(topk_ids.shape, device=device, dtype=torch.int32) expert_num_tokens = None From 61249b177de1566027fc74e9b51b45a4c973eb47 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Thu, 11 Dec 2025 17:43:41 -0500 Subject: [PATCH 08/57] [Refactor] Remove useless syncwarp (#30510) Signed-off-by: yewentao256 --- csrc/moe/grouped_topk_kernels.cu | 5 ----- 1 file changed, 5 deletions(-) diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu index 47ee5f021eb4a..5fa367abd96f5 100644 --- a/csrc/moe/grouped_topk_kernels.cu +++ b/csrc/moe/grouped_topk_kernels.cu @@ -481,8 +481,6 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias, largest = value; } } - - __syncwarp(); // Ensure all threads have valid data before reduction // Get the top2 warpwise T max1 = cg::reduce(tile, largest, cg::greater()); @@ -589,7 +587,6 @@ __global__ void group_idx_and_topk_idx_kernel( int pre_count_equal_to_top_value = 0; // Use loop to find the largset top_group while (count_equal_to_top_value < target_num_min) { - __syncwarp(); // Ensure all threads have valid data before reduction topk_group_value = cg::reduce(tile, value, cg::greater()); if (value == topk_group_value) { value = neg_inf(); @@ -644,10 +641,8 @@ __global__ void group_idx_and_topk_idx_kernel( } } queue.done(); - __syncwarp(); // Get the topk_idx queue.dumpIdx(s_topk_idx); - __syncwarp(); } // Load the valid score value From a00d88973daf9a151ecbd4c740ca99645715b9df Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Thu, 11 Dec 2025 16:59:40 -0600 Subject: [PATCH 09/57] [EPLB] Support EPLB w/ NVFP4 (#29804) Signed-off-by: Andrew Briand Co-authored-by: Andrew Briand --- .../test_eplb_fused_moe_layer_dep_nvfp4.py | 276 ++++++++++++++++++ .../layers/quantization/modelopt.py | 26 +- .../quantization/utils/flashinfer_fp4_moe.py | 79 +++++ 3 files changed, 376 insertions(+), 5 deletions(-) create mode 100644 tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py diff --git a/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py b/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py new file mode 100644 index 0000000000000..951b692e1edaf --- /dev/null +++ b/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py @@ -0,0 +1,276 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Test that the interaction between EPLB and FusedMoE Layer is okay for DP w/ NVFP4 + +from dataclasses import dataclass + +import pytest +import torch + +from tests.kernels.moe.utils import make_test_quant_config +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace +from vllm.distributed.parallel_state import ( + ensure_model_parallel_initialized, + get_dp_group, +) +from vllm.forward_context import set_forward_context +from vllm.model_executor.layers.fused_moe.layer import FusedMoE +from vllm.model_executor.layers.quantization.modelopt import ( + ModelOptNvFp4Config, + ModelOptNvFp4FusedMoE, +) + +from .eplb_utils import distributed_run, set_env_vars_and_device + + +@dataclass +class TestConfig: + num_layers: int + num_experts: int + num_local_experts: int + num_topk: int + hidden_size: int + intermediate_size: int + num_tokens: int + + +def make_fused_moe_layer( + rank: int, + layer_idx: int, + test_config: TestConfig, +) -> FusedMoE: + quant_config = None + + device = torch.device(f"cuda:{rank}") + + quant_config = ModelOptNvFp4Config( + is_checkpoint_nvfp4_serialized=True, + kv_cache_quant_algo=None, + exclude_modules=[], + ) + + fml = FusedMoE( + num_experts=test_config.num_experts, + top_k=test_config.num_topk, + hidden_size=test_config.hidden_size, + intermediate_size=test_config.intermediate_size, + prefix=f"dummy_layer_{layer_idx}", + activation="silu", + is_act_and_mul=True, + params_dtype=torch.bfloat16, + quant_config=quant_config, + ) + + nvfp4_fused_moe = ModelOptNvFp4FusedMoE(quant_config, fml) + nvfp4_fused_moe.create_weights( + fml, + test_config.num_local_experts, + test_config.hidden_size, + test_config.intermediate_size, + params_dtype=torch.uint8, + global_num_experts=test_config.num_experts, + ) + + fml = fml.to(device) + w1_q, w2_q, quant_config = make_test_quant_config( + test_config.num_local_experts, + test_config.intermediate_size, + test_config.hidden_size, + in_dtype=torch.bfloat16, + quant_dtype="nvfp4", + block_shape=None, + per_act_token_quant=False, + ) + + fml.w13_weight.data = w1_q + fml.w2_weight.data = w2_q + + fml.w2_input_scale.data = torch.randn_like(fml.w2_input_scale.data) / 5 + fml.w13_input_scale.data = torch.randn_like(fml.w13_input_scale.data) / 5 + fml.w2_weight_scale_2.data = torch.randn_like(fml.w2_weight_scale_2.data) / 5 + fml.w13_weight_scale_2.data = torch.randn_like(fml.w13_weight_scale_2.data) / 5 + fml.w2_weight_scale.data = ( + torch.randn(fml.w2_weight_scale.data.shape, device=device) / 5 + ).to(fml.w2_weight_scale.data.dtype) + fml.w13_weight_scale.data = ( + torch.randn(fml.w13_weight_scale.data.shape, device=device) / 5 + ).to(fml.w13_weight_scale.data.dtype) + + nvfp4_fused_moe.process_weights_after_loading(fml) + + fml.maybe_init_modular_kernel() + + return fml + + +def _test_eplb_fml(env, world_size: int, test_config: TestConfig): + set_env_vars_and_device(env) + + vllm_config = VllmConfig() + vllm_config.parallel_config.data_parallel_size = world_size + vllm_config.parallel_config.enable_expert_parallel = True + + with set_current_vllm_config(vllm_config): + ensure_model_parallel_initialized( + tensor_model_parallel_size=1, pipeline_model_parallel_size=1 + ) + + ep_group = get_dp_group().cpu_group + ep_rank = torch.distributed.get_rank() + + device = torch.device(f"cuda:{ep_rank}") + + fml_layers = [ + make_fused_moe_layer(ep_rank, layer_idx, test_config).to(device) + for layer_idx in range(test_config.num_layers) + ] + rank_expert_weights = [fml.get_expert_weights() for fml in fml_layers] + + hidden_states = [] + router_logits = [] + for layer_idx in range(test_config.num_layers): + hidden_states.append( + torch.randn( + (test_config.num_tokens, test_config.hidden_size), + dtype=torch.bfloat16, + device=device, + ) + ) + router_logits.append( + torch.randn( + (test_config.num_tokens, test_config.num_experts), + dtype=torch.bfloat16, + device=device, + ) + ) + + out_before_shuffle = [] + with set_forward_context( + {}, + num_tokens=test_config.num_tokens, + num_tokens_across_dp=torch.tensor( + [test_config.num_tokens] * world_size, device="cpu", dtype=torch.int + ), + vllm_config=vllm_config, + ): + for lidx, fml in enumerate(fml_layers): + out_before_shuffle.append( + fml(hidden_states[lidx].clone(), router_logits[lidx].clone()) + ) + + indices = torch.zeros( + test_config.num_layers, test_config.num_experts, dtype=torch.long + ) + for lidx in range(test_config.num_layers): + indices[lidx] = torch.Tensor(range(test_config.num_experts)) + + shuffled_indices = torch.zeros_like(indices) + for lidx in range(test_config.num_layers): + shuffled_indices[lidx] = torch.randperm(test_config.num_experts) + + rearrange_expert_weights_inplace( + indices, + shuffled_indices, + rank_expert_weights, + ep_group, + is_profile=False, + ) + + num_global_experts = test_config.num_experts + + logical_to_physical_map_list = [] + for lidx, fml in enumerate(fml_layers): + physical_to_logical_map = shuffled_indices[lidx].to(device) + logical_to_physical_map = torch.empty( + (num_global_experts,), dtype=torch.int32, device=device + ) + logical_to_physical_map[physical_to_logical_map] = torch.arange( + 0, num_global_experts, dtype=torch.int32, device=device + ) + logical_to_physical_map_list.append( + logical_to_physical_map.reshape(num_global_experts, 1) + ) + + logical_to_physical_map = torch.stack(logical_to_physical_map_list) + + for lidx, fml in enumerate(fml_layers): + logical_replica_count = torch.ones( + (test_config.num_layers, num_global_experts), + dtype=torch.int32, + device=device, + ) + fml.enable_eplb = True + fml.set_eplb_state( + lidx, + torch.zeros( + (test_config.num_layers, num_global_experts), + dtype=torch.int32, + device=device, + ), + logical_to_physical_map, + logical_replica_count, + ) + + out_after_shuffle = [] + with set_forward_context( + {}, + num_tokens=test_config.num_tokens, + num_tokens_across_dp=torch.tensor( + [test_config.num_tokens] * world_size, device="cpu", dtype=torch.int + ), + vllm_config=vllm_config, + ): + for lidx, fml in enumerate(fml_layers): + out_after_shuffle.append( + fml(hidden_states[lidx].clone(), router_logits[lidx].clone()) + ) + + for lidx in range(test_config.num_layers): + torch.testing.assert_close( + out_before_shuffle[lidx], out_after_shuffle[lidx], atol=1e-1, rtol=1e-1 + ) + + +@pytest.mark.parametrize("world_size", [2, 4]) +@pytest.mark.parametrize("num_layers", [8]) +@pytest.mark.parametrize("num_experts", [32]) +@pytest.mark.parametrize("hidden_size", [256]) +@pytest.mark.parametrize("intermediate_size", [256]) +@pytest.mark.parametrize("num_tokens", [256]) +@pytest.mark.parametrize("backend", ["latency", "throughput"]) +def test_eplb_fml( + world_size: int, + num_layers: int, + num_experts: int, + hidden_size: int, + intermediate_size: int, + num_tokens: int, + backend: str, + monkeypatch, +): + monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1") + monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", backend) + + if torch.cuda.device_count() < world_size: + pytest.skip(f"Need at least {world_size} GPUs to run the test") + + num_local_experts = num_experts // world_size + num_topk = 4 + + test_config = TestConfig( + num_layers=num_layers, + num_experts=num_experts, + num_local_experts=num_local_experts, + num_topk=num_topk, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + num_tokens=num_tokens, + ) + + distributed_run( + _test_eplb_fml, + world_size, + test_config, + ) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index e825cb33c3580..18a0fe6fbbb44 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -38,6 +38,7 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( build_flashinfer_fp4_cutlass_moe_prepare_finalize, flashinfer_trtllm_fp4_moe, + flashinfer_trtllm_fp4_routed_moe, prepare_static_weights_for_trtllm_fp4_moe, reorder_w1w3_to_w3w1, select_nvfp4_gemm_impl, @@ -1325,7 +1326,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): "Accuracy may be affected." ) - w13_weight_scale_2 = layer.w13_weight_scale_2[:, 0] + w13_weight_scale_2 = layer.w13_weight_scale_2[:, 0].contiguous() layer.w13_weight_scale_2 = Parameter(w13_weight_scale_2, requires_grad=False) # Common processing for input scales and alphas @@ -1482,6 +1483,10 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): a2_gscale=layer.w2_input_scale_quant, ) + @property + def supports_eplb(self) -> bool: + return True + def apply( self, layer: FusedMoE, @@ -1500,11 +1505,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): if ( self.allow_flashinfer and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM + and not layer.enable_eplb ): - if layer.enable_eplb: - raise NotImplementedError( - "EPLB not supported for `ModelOptNvFp4FusedMoE` yet." - ) return flashinfer_trtllm_fp4_moe( layer=layer, x=x, @@ -1522,6 +1524,20 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): router_logits=router_logits, ) + # EPLB path + if ( + self.allow_flashinfer + and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM + ): + return flashinfer_trtllm_fp4_routed_moe( + layer=layer, + x=x, + topk_ids=topk_ids, + topk_weights=topk_weights, + top_k=layer.top_k, + global_num_experts=layer.global_num_experts, + ) + if self.use_marlin: return fused_marlin_moe( x, diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index eda40657b1e39..8f96222f19f20 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -331,3 +331,82 @@ def flashinfer_trtllm_fp4_moe( )[0] return out + + +def flashinfer_trtllm_fp4_routed_moe( + layer: torch.nn.Module, + x: torch.Tensor, + topk_ids: torch.Tensor, + topk_weights: torch.Tensor, + top_k: int, + global_num_experts: int, +) -> torch.Tensor: + """ + Apply FlashInfer TensorRT-LLM FP4 MoE kernel. Uses packed + input top k expert indices and scores rather than computing + top k expert indices from scores. + + Args: + layer: The MoE layer with weights and scales + x: Input tensor + topk_ids: Ids of selected experts + top_k: Number of experts to select per token + global_num_experts: Total number of experts across all ranks + + Returns: + Output tensor from the MoE layer + """ + import flashinfer + + # Pack top k ids and expert weights into a single int32 tensor, as + # required by TRT-LLM + packed_tensor = (topk_ids.to(torch.int32) << 16) | topk_weights.to( + torch.bfloat16 + ).view(torch.int16) + + # Quantize input to FP4 + a1_gscale = layer.w13_input_scale_quant + (hidden_states_fp4, hidden_states_scale_linear_fp4) = flashinfer.fp4_quantize( + x, + a1_gscale, + is_sf_swizzled_layout=False, + ) + + # Call TRT-LLM FP4 block-scale MoE kernel + out = flashinfer.fused_moe.trtllm_fp4_block_scale_routed_moe( + topk_ids=packed_tensor, + routing_bias=None, + hidden_states=hidden_states_fp4, + hidden_states_scale=hidden_states_scale_linear_fp4.view( + torch.float8_e4m3fn + ).flatten(), + gemm1_weights=layer.gemm1_weights_fp4_shuffled.data, + gemm1_weights_scale=layer.gemm1_scales_fp4_shuffled.data.view( + torch.float8_e4m3fn + ), + gemm1_bias=None, + gemm1_alpha=None, + gemm1_beta=None, + gemm1_clamp_limit=None, + gemm2_weights=layer.gemm2_weights_fp4_shuffled.data, + gemm2_weights_scale=layer.gemm2_scales_fp4_shuffled.data.view( + torch.float8_e4m3fn + ), + gemm2_bias=None, + output1_scale_scalar=layer.g1_scale_c.data, + output1_scale_gate_scalar=layer.g1_alphas.data, + output2_scale_scalar=layer.g2_alphas.data, + num_experts=global_num_experts, + top_k=top_k, + n_group=0, + topk_group=0, + intermediate_size=layer.intermediate_size_per_partition, + local_expert_offset=layer.ep_rank * layer.local_num_experts, + local_num_experts=layer.local_num_experts, + routed_scaling_factor=None, + tile_tokens_dim=None, + routing_method_type=1, + do_finalize=True, + )[0] + + return out From 2cc5affc388d3d134bacc14f042405ead925531b Mon Sep 17 00:00:00 2001 From: Concurrensee Date: Thu, 11 Dec 2025 17:03:54 -0600 Subject: [PATCH 10/57] [ROCM][CI] Fix AMD Examples Test Group (#30276) Signed-off-by: Yida Wu Signed-off-by: Yida --- .buildkite/test-amd.yaml | 3 +-- examples/offline_inference/basic/embed.py | 8 ++++++++ examples/offline_inference/basic/score.py | 8 ++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 4038d32834e68..4e957634e7b47 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -435,7 +435,7 @@ steps: - label: Examples Test # 30min timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking working_dir: "/vllm-workspace/examples" @@ -455,7 +455,6 @@ steps: # for multi-modal models - python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0 - - python3 offline_inference/vision_language_pooling.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0 - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 # for pooling models diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py index eeb7137ff7bae..17f727b33d321 100644 --- a/examples/offline_inference/basic/embed.py +++ b/examples/offline_inference/basic/embed.py @@ -4,6 +4,9 @@ from argparse import Namespace from vllm import LLM, EngineArgs +from vllm.attention.backends.registry import AttentionBackendEnum +from vllm.config import AttentionConfig +from vllm.platforms import current_platform from vllm.utils.argparse_utils import FlexibleArgumentParser @@ -20,6 +23,11 @@ def parse_args(): def main(args: Namespace): + if current_platform.is_rocm(): + args.attention_config = AttentionConfig( + backend=AttentionBackendEnum.FLEX_ATTENTION + ) + # Sample prompts. prompts = [ "Hello, my name is", diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py index cbca50eb5efa8..b2dadffd249f5 100644 --- a/examples/offline_inference/basic/score.py +++ b/examples/offline_inference/basic/score.py @@ -4,6 +4,9 @@ from argparse import Namespace from vllm import LLM, EngineArgs +from vllm.attention.backends.registry import AttentionBackendEnum +from vllm.config import AttentionConfig +from vllm.platforms import current_platform from vllm.utils.argparse_utils import FlexibleArgumentParser @@ -20,6 +23,11 @@ def parse_args(): def main(args: Namespace): + if current_platform.is_rocm(): + args.attention_config = AttentionConfig( + backend=AttentionBackendEnum.FLEX_ATTENTION + ) + # Sample prompts. text_1 = "What is the capital of France?" texts_2 = [ From d527cf0b3d4210c4277f258c9d26286cec726a6f Mon Sep 17 00:00:00 2001 From: Ev Lacey Date: Thu, 11 Dec 2025 15:36:31 -0800 Subject: [PATCH 11/57] [FIX]Patch run-cluster.sh (fix for #28328) (#30002) Signed-off-by: elacey Signed-off-by: Ev Lacey --- examples/online_serving/run_cluster.sh | 60 +++++++++++++++----------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/examples/online_serving/run_cluster.sh b/examples/online_serving/run_cluster.sh index 0756d4b0ae556..5996098eb25aa 100644 --- a/examples/online_serving/run_cluster.sh +++ b/examples/online_serving/run_cluster.sh @@ -21,7 +21,7 @@ # --worker \ # /abs/path/to/huggingface/cache \ # -e VLLM_HOST_IP= -# +# # Each worker requires a unique VLLM_HOST_IP value. # Keep each terminal session open. Closing a session stops the associated Ray # node and thereby shuts down the entire cluster. @@ -59,6 +59,34 @@ if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then exit 1 fi +# Extract VLLM_HOST_IP from ADDITIONAL_ARGS (e.g. "-e VLLM_HOST_IP=..."). +VLLM_HOST_IP="" +for ((i = 0; i < ${#ADDITIONAL_ARGS[@]}; i++)); do + arg="${ADDITIONAL_ARGS[$i]}" + case "${arg}" in + -e) + next="${ADDITIONAL_ARGS[$((i + 1))]:-}" + if [[ "${next}" == VLLM_HOST_IP=* ]]; then + VLLM_HOST_IP="${next#VLLM_HOST_IP=}" + break + fi + ;; + -eVLLM_HOST_IP=* | VLLM_HOST_IP=*) + VLLM_HOST_IP="${arg#*=}" + break + ;; + esac +done + +# For the head node, HEAD_NODE_ADDRESS and VLLM_HOST_IP should be consistent. +if [[ "${NODE_TYPE}" == "--head" && -n "${VLLM_HOST_IP}" ]]; then + if [[ "${VLLM_HOST_IP}" != "${HEAD_NODE_ADDRESS}" ]]; then + echo "Warning: VLLM_HOST_IP (${VLLM_HOST_IP}) differs from head_node_ip (${HEAD_NODE_ADDRESS})." + echo "Using VLLM_HOST_IP as the head node address." + HEAD_NODE_ADDRESS="${VLLM_HOST_IP}" + fi +fi + # Generate a unique container name with random suffix. # Docker container names must be unique on each host. # The random suffix allows multiple Ray containers to run simultaneously on the same machine, @@ -74,36 +102,17 @@ cleanup() { trap cleanup EXIT # Build the Ray start command based on the node role. -# The head node manages the cluster and accepts connections on port 6379, +# The head node manages the cluster and accepts connections on port 6379, # while workers connect to the head's address. RAY_START_CMD="ray start --block" if [ "${NODE_TYPE}" == "--head" ]; then - RAY_START_CMD+=" --head --port=6379" + RAY_START_CMD+=" --head --node-ip-address=${HEAD_NODE_ADDRESS} --port=6379" else + RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379" -fi - -# Parse VLLM_HOST_IP from additional args if present. -# This is needed for multi-NIC configurations where Ray needs explicit IP bindings. -VLLM_HOST_IP="" -for arg in "${ADDITIONAL_ARGS[@]}"; do - if [[ $arg == "-e" ]]; then - continue + if [ -n "${VLLM_HOST_IP}" ]; then + RAY_START_CMD+=" --node-ip-address=${VLLM_HOST_IP}" fi - if [[ $arg == VLLM_HOST_IP=* ]]; then - VLLM_HOST_IP="${arg#VLLM_HOST_IP=}" - break - fi -done - -# Build Ray IP environment variables if VLLM_HOST_IP is set. -# These variables ensure Ray binds to the correct network interface on multi-NIC systems. -RAY_IP_VARS=() -if [ -n "${VLLM_HOST_IP}" ]; then - RAY_IP_VARS=( - -e "RAY_NODE_IP_ADDRESS=${VLLM_HOST_IP}" - -e "RAY_OVERRIDE_NODE_IP_ADDRESS=${VLLM_HOST_IP}" - ) fi # Launch the container with the assembled parameters. @@ -118,6 +127,5 @@ docker run \ --shm-size 10.24g \ --gpus all \ -v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \ - "${RAY_IP_VARS[@]}" \ "${ADDITIONAL_ARGS[@]}" \ "${DOCKER_IMAGE}" -c "${RAY_START_CMD}" From 48661d275fb44b969112a7bd8586dfd9f498e2e3 Mon Sep 17 00:00:00 2001 From: rasmith Date: Thu, 11 Dec 2025 18:24:20 -0600 Subject: [PATCH 12/57] [CI/Build][AMD] Skip tests in test_fusions_e2e and test_dbo_dp_ep_gsm8k that require non-existing imports for ROCm (#30417) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- tests/compile/distributed/test_fusions_e2e.py | 26 ++++++++++++++++++- tests/v1/distributed/test_dbo.py | 2 ++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py index 75a81efedea3b..5379b5157b811 100644 --- a/tests/compile/distributed/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -138,6 +138,17 @@ elif current_platform.is_rocm(): CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"] +def has_cuda_graph_wrapper_metadata() -> bool: + from importlib import import_module + + try: + module = import_module("torch._inductor.utils") + module.CUDAGraphWrapperMetadata # noqa B018 + except AttributeError: + return False + return True + + @pytest.mark.parametrize( "model_name, model_kwargs, backend, matches, custom_ops", # Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8 @@ -145,7 +156,20 @@ CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"] # quant_fp4 only has the custom impl + list(flat_product(MODELS_FP4, [""])), ) -@pytest.mark.parametrize("inductor_graph_partition", [True, False]) +@pytest.mark.parametrize( + "inductor_graph_partition", + [ + pytest.param( + True, + marks=pytest.mark.skipif( + not has_cuda_graph_wrapper_metadata(), + reason="This test requires" + "torch._inductor.utils.CUDAGraphWrapperMetadata to run", + ), + ), + False, + ], +) def test_attn_quant( model_name: str, model_kwargs: dict[str, Any], diff --git a/tests/v1/distributed/test_dbo.py b/tests/v1/distributed/test_dbo.py index f3a159762ea54..e5cbe1ce85e96 100644 --- a/tests/v1/distributed/test_dbo.py +++ b/tests/v1/distributed/test_dbo.py @@ -13,6 +13,7 @@ import torch from tests.evals.gsm8k.gsm8k_eval import evaluate_gsm8k from tests.utils import RemoteOpenAIServer +from vllm.utils.import_utils import has_deep_ep # Detect Blackwell / B200 (compute capability 10.x) try: @@ -44,6 +45,7 @@ DEEPEP_BACKENDS = [ ] +@pytest.mark.skipif(not has_deep_ep(), reason="These tests require deep_ep to run") @pytest.mark.parametrize("all2all_backend", DEEPEP_BACKENDS) @pytest.mark.xfail( IS_BLACKWELL, From 0ab23c2b2be1cdbde41b824186f57343f102e306 Mon Sep 17 00:00:00 2001 From: jiahanc <173873397+jiahanc@users.noreply.github.com> Date: Thu, 11 Dec 2025 17:00:58 -0800 Subject: [PATCH 13/57] [fix] fix SM check for Flashinfer TRTLLM MOE (#30314) Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com> --- .../layers/quantization/utils/flashinfer_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index 00c2720a34875..ba3653e4b5ea7 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -290,7 +290,7 @@ def get_flashinfer_moe_backend() -> FlashinferMoeBackend: if flashinfer_moe_backend in backend_map: if ( flashinfer_moe_backend == "latency" - and not current_platform.is_device_capability(100) + and not current_platform.has_device_capability(100) ): logger.info_once( "Flashinfer TRTLLM MOE backend is only supported on " From ba809266818cfb9e63bcb34d79fdd77af6e308fe Mon Sep 17 00:00:00 2001 From: rasmith Date: Thu, 11 Dec 2025 19:02:19 -0600 Subject: [PATCH 14/57] [CI/Build][AMD] Skip test_cutlass_w4a8_moe tests on ROCm sine they require cutlass_pack_scale_fp8 (#30508) Signed-off-by: Randall Smith Signed-off-by: Michael Goin Signed-off-by: mgoin Co-authored-by: Randall Smith Co-authored-by: Michael Goin Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- tests/kernels/quantization/test_cutlass_w4a8_moe.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/kernels/quantization/test_cutlass_w4a8_moe.py b/tests/kernels/quantization/test_cutlass_w4a8_moe.py index 3560402a29e90..a855f7333b617 100644 --- a/tests/kernels/quantization/test_cutlass_w4a8_moe.py +++ b/tests/kernels/quantization/test_cutlass_w4a8_moe.py @@ -18,7 +18,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types -IS_SUPPORTED_BY_GPU = current_platform.get_device_capability()[0] >= 9 +IS_SUPPORTED_BY_GPU = ( + current_platform.is_cuda() and current_platform.get_device_capability()[0] >= 9 +) def to_fp8(tensor: torch.Tensor) -> torch.Tensor: From b5945d49c08b66658110fa1c63e55fde66fcfad7 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Thu, 11 Dec 2025 19:37:24 -0600 Subject: [PATCH 15/57] [ROCm][CI] Use mi325_4 agent pool for V1 e2e tests (#30526) Signed-off-by: Andreas Karatzas --- .buildkite/test-amd.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 4e957634e7b47..c7d460be6e2b5 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -326,10 +326,10 @@ steps: commands: - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py -- label: V1 Test e2e + engine # 30min - timeout_in_minutes: 45 +- label: V1 Test e2e + engine # 65min + timeout_in_minutes: 90 mirror_hardwares: [amdexperimental] - agent_pool: mi325_1 + agent_pool: mi325_4 # grade: Blocking source_file_dependencies: - vllm/ From 042da732445f5cef93cb83e1045333544e61a0a1 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 11 Dec 2025 20:54:12 -0500 Subject: [PATCH 16/57] [Core] Refactor `_build_attention_metadata` (#29628) Signed-off-by: Lucas Wilkinson --- vllm/v1/worker/gpu_model_runner.py | 248 ++++++++++++++--------------- 1 file changed, 123 insertions(+), 125 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 40c8059f90d34..3f20296c27ba7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1534,28 +1534,13 @@ class GPUModelRunner( """ :return: tuple[attn_metadata, spec_decode_common_attn_metadata] """ + # Attention metadata is not needed for attention free models + if len(self.kv_cache_config.kv_cache_groups) == 0: + return {}, None + num_tokens_padded = num_tokens_padded or num_tokens num_reqs_padded = num_reqs_padded or num_reqs - - logits_indices_padded = None - num_logits_indices = None - if logits_indices is not None: - num_logits_indices = logits_indices.size(0) - if self.cache_config.kv_sharing_fast_prefill: - logits_indices_padded = self._prepare_kv_sharing_fast_prefill( - logits_indices - ) - - # update seq_lens of decode reqs under DCP. - if self.dcp_world_size > 1: - self.dcp_local_seq_lens.cpu[:num_reqs] = get_dcp_local_seq_lens( - self.seq_lens.cpu[:num_reqs], - self.dcp_world_size, - self.dcp_rank, - self.parallel_config.cp_kv_cache_interleave_size, - ) - self.dcp_local_seq_lens.cpu[num_reqs:].fill_(0) - self.dcp_local_seq_lens.copy_to_gpu(num_reqs_padded) + assert num_reqs_padded is not None and num_tokens_padded is not None attn_metadata: PerLayerAttnMetadata = {} if ubatch_slices is not None: @@ -1576,36 +1561,12 @@ class GPUModelRunner( self.num_accepted_tokens.np[num_reqs:].fill(1) self.num_accepted_tokens.copy_to_gpu() - # Used in the below loop, uses padded shapes - query_start_loc = self.query_start_loc.gpu[: num_reqs_padded + 1] - query_start_loc_cpu = self.query_start_loc.cpu[: num_reqs_padded + 1] - seq_lens = self.seq_lens.gpu[:num_reqs_padded] - seq_lens_cpu = self.seq_lens.cpu[:num_reqs_padded] - num_computed_tokens_cpu = self.input_batch.num_computed_tokens_cpu_tensor[ - :num_reqs_padded - ] + kv_cache_groups = self.kv_cache_config.kv_cache_groups - dcp_local_seq_lens, dcp_local_seq_lens_cpu = None, None - if self.dcp_world_size > 1: - dcp_local_seq_lens = self.dcp_local_seq_lens.gpu[:num_reqs_padded] - dcp_local_seq_lens_cpu = self.dcp_local_seq_lens.cpu[:num_reqs_padded] - - spec_decode_common_attn_metadata = None - - # Prepare the attention metadata for each KV cache group and make layers - # in the same group share the same metadata. - for kv_cache_gid, kv_cache_group in enumerate( - self.kv_cache_config.kv_cache_groups - ): - encoder_seq_lens, encoder_seq_lens_cpu = self._get_encoder_seq_lens( - num_scheduled_tokens or {}, - kv_cache_group.kv_cache_spec, - num_reqs_padded, - ) - - if isinstance(kv_cache_group.kv_cache_spec, EncoderOnlyAttentionSpec): - # Encoder-only layers do not have KV cache, so we need to - # create a dummy block table and slot mapping for them. + def _get_block_table_and_slot_mapping(kv_cache_gid: int): + assert num_reqs_padded is not None and num_tokens_padded is not None + kv_cache_spec = kv_cache_groups[kv_cache_gid].kv_cache_spec + if isinstance(kv_cache_spec, EncoderOnlyAttentionSpec): blk_table_tensor = torch.zeros( (num_reqs_padded, 1), dtype=torch.int32, @@ -1621,92 +1582,129 @@ class GPUModelRunner( blk_table_tensor = blk_table.get_device_tensor(num_reqs_padded) slot_mapping = blk_table.slot_mapping.gpu[:num_tokens_padded] - # Fill unused with -1. Needed for reshape_and_cache in full cuda - # graph mode. `blk_table_tensor` -1 to match mamba PAD_SLOT_ID - slot_mapping[num_tokens:num_tokens_padded].fill_(-1) - blk_table_tensor[num_reqs:num_reqs_padded].fill_(-1) + # Fill unused with -1. Needed for reshape_and_cache in full cuda + # graph mode. `blk_table_tensor` -1 to match mamba PAD_SLOT_ID + slot_mapping[num_tokens:num_tokens_padded].fill_(-1) + blk_table_tensor[num_reqs:num_reqs_padded].fill_(-1) - common_attn_metadata = CommonAttentionMetadata( - query_start_loc=query_start_loc, - query_start_loc_cpu=query_start_loc_cpu, - seq_lens=seq_lens, - _seq_lens_cpu=seq_lens_cpu, - _num_computed_tokens_cpu=num_computed_tokens_cpu, - num_actual_tokens=num_tokens_padded, - num_reqs=num_reqs_padded, - max_query_len=max_query_len, - max_seq_len=max_seq_len, - block_table_tensor=blk_table_tensor, - slot_mapping=slot_mapping, - logits_indices_padded=logits_indices_padded, - num_logits_indices=num_logits_indices, - causal=True, - encoder_seq_lens=encoder_seq_lens, - encoder_seq_lens_cpu=encoder_seq_lens_cpu, - dcp_local_seq_lens=dcp_local_seq_lens, - dcp_local_seq_lens_cpu=dcp_local_seq_lens_cpu, + return blk_table_tensor, slot_mapping + + block_table_gid_0, slot_mapping_gid_0 = _get_block_table_and_slot_mapping(0) + cm_base = CommonAttentionMetadata( + query_start_loc=self.query_start_loc.gpu[: num_reqs_padded + 1], + query_start_loc_cpu=self.query_start_loc.cpu[: num_reqs_padded + 1], + seq_lens=self.seq_lens.gpu[:num_reqs_padded], + _seq_lens_cpu=self.seq_lens.cpu[:num_reqs_padded], + _num_computed_tokens_cpu=self.input_batch.num_computed_tokens_cpu_tensor[ + :num_reqs_padded + ], + num_reqs=num_reqs_padded, + num_actual_tokens=num_tokens_padded, + max_query_len=max_query_len, + max_seq_len=max_seq_len, + block_table_tensor=block_table_gid_0, + slot_mapping=slot_mapping_gid_0, + causal=True, + ) + + if self.dcp_world_size > 1: + self.dcp_local_seq_lens.cpu[:num_reqs] = get_dcp_local_seq_lens( + self.seq_lens.cpu[:num_reqs], + self.dcp_world_size, + self.dcp_rank, + self.parallel_config.cp_kv_cache_interleave_size, ) + self.dcp_local_seq_lens.cpu[num_reqs:].fill_(0) + self.dcp_local_seq_lens.copy_to_gpu(num_reqs_padded) + + cm_base.dcp_local_seq_lens = self.dcp_local_seq_lens.gpu[:num_reqs_padded] + cm_base.dcp_local_seq_lens_cpu = self.dcp_local_seq_lens.cpu[ + :num_reqs_padded + ] + + if logits_indices is not None and self.cache_config.kv_sharing_fast_prefill: + cm_base.num_logits_indices = logits_indices.size(0) + cm_base.logits_indices_padded = self._prepare_kv_sharing_fast_prefill( + logits_indices + ) + + def _build_attn_group_metadata( + kv_cache_gid: int, + attn_gid: int, + common_attn_metadata: CommonAttentionMetadata, + ubid: int | None = None, + ) -> None: + attn_group = self.attn_groups[kv_cache_gid][attn_gid] + cascade_attn_prefix_len = ( + cascade_attn_prefix_lens[kv_cache_gid][attn_gid] + if cascade_attn_prefix_lens + else 0 + ) + + builder = attn_group.get_metadata_builder(ubid or 0) + extra_attn_metadata_args = {} + if use_spec_decode and isinstance(builder, GDNAttentionMetadataBuilder): + assert ubid is None, "UBatching not supported with GDN yet" + extra_attn_metadata_args = dict( + num_accepted_tokens=self.num_accepted_tokens.gpu[:num_reqs_padded], + num_decode_draft_tokens_cpu=self.num_decode_draft_tokens.cpu[ + :num_reqs_padded + ], + ) + + if for_cudagraph_capture: + attn_metadata_i = builder.build_for_cudagraph_capture( + common_attn_metadata + ) + else: + attn_metadata_i = builder.build( + common_prefix_len=cascade_attn_prefix_len, + common_attn_metadata=common_attn_metadata, + **extra_attn_metadata_args, + ) + + if ubid is None: + assert isinstance(attn_metadata, dict) + attn_metadata_dict = attn_metadata + else: + assert isinstance(attn_metadata, list) + attn_metadata_dict = attn_metadata[ubid] + + for layer_name in attn_group.layer_names: + attn_metadata_dict[layer_name] = attn_metadata_i + + # Prepare the attention metadata for each KV cache group and make layers + # in the same group share the same metadata. + spec_decode_common_attn_metadata = None + for kv_cache_gid, kv_cache_group in enumerate(kv_cache_groups): + cm = copy(cm_base) # shallow copy + + # Basically only the encoder seq_lens, block_table and slot_mapping change + # for each kv_cache_group. + cm.encoder_seq_lens, cm.encoder_seq_lens_cpu = self._get_encoder_seq_lens( + num_scheduled_tokens or {}, + kv_cache_group.kv_cache_spec, + num_reqs_padded, + ) + if kv_cache_gid > 0: + cm.block_table_tensor, cm.slot_mapping = ( + _get_block_table_and_slot_mapping(kv_cache_gid) + ) if self.speculative_config and spec_decode_common_attn_metadata is None: if isinstance(self.drafter, EagleProposer): if self.drafter.attn_layer_names[0] in kv_cache_group.layer_names: - spec_decode_common_attn_metadata = common_attn_metadata + spec_decode_common_attn_metadata = cm else: - spec_decode_common_attn_metadata = common_attn_metadata - - for attn_gid, attn_group in enumerate(self.attn_groups[kv_cache_gid]): - cascade_attn_prefix_len = ( - cascade_attn_prefix_lens[kv_cache_gid][attn_gid] - if cascade_attn_prefix_lens - else 0 - ) - builder = attn_group.get_metadata_builder() - - extra_attn_metadata_args = {} - if use_spec_decode and isinstance(builder, GDNAttentionMetadataBuilder): - extra_attn_metadata_args = dict( - num_accepted_tokens=self.num_accepted_tokens.gpu[ - :num_reqs_padded - ], - num_decode_draft_tokens_cpu=self.num_decode_draft_tokens.cpu[ - :num_reqs_padded - ], - ) + spec_decode_common_attn_metadata = cm + for attn_gid in range(len(self.attn_groups[kv_cache_gid])): if ubatch_slices is not None: - common_attn_metadata_list = split_attn_metadata( - ubatch_slices, common_attn_metadata - ) - for ubid, common_attn_metadata in enumerate( - common_attn_metadata_list - ): - builder = attn_group.get_metadata_builder(ubatch_id=ubid) - if for_cudagraph_capture: - attn_metadata_i = builder.build_for_cudagraph_capture( - common_attn_metadata - ) - else: - attn_metadata_i = builder.build( - common_prefix_len=cascade_attn_prefix_len, - common_attn_metadata=common_attn_metadata, - ) - for layer_name in kv_cache_group.layer_names: - assert type(attn_metadata) is list - attn_metadata[ubid][layer_name] = attn_metadata_i + for ubid, _cm in enumerate(split_attn_metadata(ubatch_slices, cm)): + _build_attn_group_metadata(kv_cache_gid, attn_gid, _cm, ubid) + else: - assert isinstance(attn_metadata, dict) - if for_cudagraph_capture: - attn_metadata_i = builder.build_for_cudagraph_capture( - common_attn_metadata - ) - else: - attn_metadata_i = builder.build( - common_prefix_len=cascade_attn_prefix_len, - common_attn_metadata=common_attn_metadata, - **extra_attn_metadata_args, - ) - for layer_name in attn_group.layer_names: - attn_metadata[layer_name] = attn_metadata_i + _build_attn_group_metadata(kv_cache_gid, attn_gid, cm) if self.is_mm_prefix_lm: req_doc_ranges = {} From f355ad5412bc414a2a55f55481cb4aa1d909b4a3 Mon Sep 17 00:00:00 2001 From: Fadi Arafeh <115173828+fadara01@users.noreply.github.com> Date: Fri, 12 Dec 2025 02:09:25 +0000 Subject: [PATCH 17/57] [CPU][FIX] Fix build failures on Arm CPUs with torch nightly (#30481) Signed-off-by: Fadi Arafeh --- cmake/utils.cmake | 23 ++++++++++++++--------- vllm/platforms/cpu.py | 14 ++++++++++---- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 5047c354ff7d2..bdb2ba74d944d 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -140,16 +140,21 @@ function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR) run_python(_VLLM_TORCH_GOMP_PATH " import os, glob -try: - import torch - torch_pkg = os.path.dirname(torch.__file__) - site_root = os.path.dirname(torch_pkg) - torch_libs = os.path.join(site_root, 'torch.libs') - print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0]) -except: - print('') +import torch +torch_pkg = os.path.dirname(torch.__file__) +site_root = os.path.dirname(torch_pkg) + +# Search both torch.libs and torch/lib +roots = [os.path.join(site_root, 'torch.libs'), os.path.join(torch_pkg, 'lib')] +candidates = [] +for root in roots: + if not os.path.isdir(root): + continue + candidates.extend(glob.glob(os.path.join(root, 'libgomp*.so*'))) + +print(candidates[0] if candidates else '') " - "failed to probe torch.libs for libgomp") + "failed to probe for libgomp") if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}") return() diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index a49b6e92df00d..d961dcf13e53e 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -325,10 +325,16 @@ class CpuPlatform(Platform): # We need to find the location of PyTorch's libgomp torch_pkg = os.path.dirname(torch.__file__) site_root = os.path.dirname(torch_pkg) - torch_libs = os.path.join(site_root, "torch.libs") - pytorch_libgomp_so_candidates = glob.glob( - os.path.join(torch_libs, "libgomp-*.so*") - ) + # Search both torch.libs and torch/lib - See: https://github.com/vllm-project/vllm/issues/30470 + torch_libs_paths = [ + os.path.join(site_root, "torch.libs"), + os.path.join(torch_pkg, "lib"), + ] + pytorch_libgomp_so_candidates = [] + for torch_libs in torch_libs_paths: + pytorch_libgomp_so_candidates.extend( + glob.glob(os.path.join(torch_libs, "libgomp*.so*")) + ) if pytorch_libgomp_so_candidates: pytorch_libgomp_so = pytorch_libgomp_so_candidates[0] if ld_preload_str: From 6a6fc41c799916521b1fa2914f72e108352e1bf6 Mon Sep 17 00:00:00 2001 From: Bhanu Prakash Voutharoja <59905694+Bhanu068@users.noreply.github.com> Date: Fri, 12 Dec 2025 13:27:22 +1100 Subject: [PATCH 18/57] gptq marlin quantization support for fused moe with lora (#30254) Signed-off-by: Bhanu068 --- csrc/moe/marlin_moe_wna16/ops.cu | 2 +- .../model_executor/layers/fused_moe/config.py | 36 ++++++ .../layers/quantization/gptq_marlin.py | 110 +++++++++++++++++- 3 files changed, 146 insertions(+), 2 deletions(-) diff --git a/csrc/moe/marlin_moe_wna16/ops.cu b/csrc/moe/marlin_moe_wna16/ops.cu index 27b6ffaa67176..4fd8fc5c54202 100644 --- a/csrc/moe/marlin_moe_wna16/ops.cu +++ b/csrc/moe/marlin_moe_wna16/ops.cu @@ -860,4 +860,4 @@ torch::Tensor moe_wna16_marlin_gemm( TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("moe_wna16_marlin_gemm", &moe_wna16_marlin_gemm); -} +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 5eb6bc4829adf..a9a2990ca2b53 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -543,6 +543,42 @@ def int8_w8a8_moe_quant_config( ) +def gptq_marlin_moe_quant_config( + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + weight_bits: int, + group_size: int, + w1_zp: torch.Tensor | None = None, + w2_zp: torch.Tensor | None = None, + w1_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None = None, +): + """ + Construct a quant config for gptq marlin quantization. + """ + from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape + + w_shape = None if group_size == -1 else GroupShape(row=1, col=group_size) + + # Activations are NOT quantized for GPTQ (fp16/bf16) + a_shape = w_shape # Same as weight shape for alignment + + # Determine weight dtype + if weight_bits == 4: + weight_dtype = "int4" + elif weight_bits == 8: + weight_dtype = torch.int8 + else: + raise ValueError(f"Unsupported weight_bits: {weight_bits}") + + return FusedMoEQuantConfig( + _a1=FusedMoEQuantDesc(dtype=None, shape=a_shape), + _a2=FusedMoEQuantDesc(dtype=None, shape=a_shape), + _w1=FusedMoEQuantDesc(weight_dtype, w_shape, w1_scale, None, w1_zp, w1_bias), + _w2=FusedMoEQuantDesc(weight_dtype, w_shape, w2_scale, None, w2_zp, w2_bias), + ) + + def mxfp4_w4a16_moe_quant_config( w1_scale: Union[torch.Tensor, "PrecisionConfig"], w2_scale: Union[torch.Tensor, "PrecisionConfig"], diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 8d1715f52f097..6e5dcfe59b2f9 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -732,6 +732,14 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): is_a_8bit=is_a_8bit, ) replace_parameter(layer, "w2_qweight", marlin_w2_qweight) + + # The modular kernel expects w13_weight and w2_weight, + # but GPTQ uses w13_qweight and w2_qweight + # Alias for modular kernel + layer.w13_weight = layer.w13_qweight + # Alias for modular kernel + layer.w2_weight = layer.w2_qweight + # Repack scales marlin_w13_scales = marlin_moe_permute_scales( s=layer.w13_scales, @@ -782,7 +790,107 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): def get_fused_moe_quant_config( self, layer: torch.nn.Module ) -> FusedMoEQuantConfig | None: - return None + from vllm.model_executor.layers.fused_moe.config import ( + gptq_marlin_moe_quant_config, + ) + + return gptq_marlin_moe_quant_config( + w1_scale=layer.w13_scales, + w2_scale=layer.w2_scales, + weight_bits=self.quant_config.weight_bits, + group_size=self.quant_config.group_size, + w1_zp=getattr(layer, "w13_qzeros", None) + if not self.quant_config.is_sym + else None, + w2_zp=getattr(layer, "w2_qzeros", None) + if not self.quant_config.is_sym + else None, + w1_bias=getattr(layer, "w13_bias", None), + w2_bias=getattr(layer, "w2_bias", None), + ) + + def select_gemm_impl( + self, + prepare_finalize, + layer: torch.nn.Module, + ): + """ + Select the GEMM implementation for GPTQ-Marlin MoE. + + Returns MarlinExperts configured for GPTQ quantization. + This is ONLY used when LoRA is enabled. + Without LoRA, GPTQ uses its own apply() method. + """ + # Only use modular kernels when LoRA is enabled + # Without LoRA, GPTQ's own apply() method works fine and is more efficient + if not self.moe.is_lora_enabled: + raise NotImplementedError( + "GPTQ-Marlin uses its own apply() method when LoRA is not enabled. " + "Modular kernels are only used for LoRA support." + ) + + # The modular marlin kernels do not support 8-bit weights. + if self.quant_config.weight_bits == 8: + raise NotImplementedError( + "GPTQ-Marlin kernel does not support 8-bit weights." + ) + + from vllm.model_executor.layers.fused_moe import modular_kernel as mk + from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( + BatchedMarlinExperts, + MarlinExperts, + ) + + # Ensure quant config is initialized + assert self.moe_quant_config is not None, ( + "moe_quant_config must be initialized before select_gemm_impl" + ) + + w13_g_idx = ( + getattr(layer, "w13_g_idx", None) if self.quant_config.desc_act else None + ) + w2_g_idx = ( + getattr(layer, "w2_g_idx", None) if self.quant_config.desc_act else None + ) + w13_g_idx_sort_indices = ( + getattr(layer, "w13_g_idx_sort_indices", None) + if self.quant_config.desc_act + else None + ) + w2_g_idx_sort_indices = ( + getattr(layer, "w2_g_idx_sort_indices", None) + if self.quant_config.desc_act + else None + ) + + # Check if using batched expert format (for Expert Parallelism) + if ( + prepare_finalize.activation_format + == mk.FusedMoEActivationFormat.BatchedExperts + ): + # For batched format, use BatchedMarlinExperts + max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank() + assert max_num_tokens_per_rank is not None + return BatchedMarlinExperts( + max_num_tokens=max_num_tokens_per_rank, + num_dispatchers=prepare_finalize.num_dispatchers(), + quant_config=self.moe_quant_config, + w13_g_idx=w13_g_idx, + w2_g_idx=w2_g_idx, + w13_g_idx_sort_indices=w13_g_idx_sort_indices, + w2_g_idx_sort_indices=w2_g_idx_sort_indices, + is_k_full=self.is_k_full, + ) + else: + # Standard Marlin experts for GPTQ + return MarlinExperts( + quant_config=self.moe_quant_config, + w13_g_idx=w13_g_idx, + w2_g_idx=w2_g_idx, + w13_g_idx_sort_indices=w13_g_idx_sort_indices, + w2_g_idx_sort_indices=w2_g_idx_sort_indices, + is_k_full=self.is_k_full, + ) def apply( self, From 9f2fc16a6903f8988515ce2560d3ef0850809c42 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 11 Dec 2025 21:53:57 -0500 Subject: [PATCH 19/57] [Bugfix][Model] Fix Afmoe rope_parameters issue (#30505) Signed-off-by: mgoin Signed-off-by: Michael Goin Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/models/registry.py | 5 +---- vllm/model_executor/models/afmoe.py | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 020cb749341a6..18056a9657e82 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -173,10 +173,7 @@ class _HfExamplesInfo: _TEXT_GENERATION_EXAMPLE_MODELS = { # [Decoder-only] - "AfmoeForCausalLM": _HfExamplesInfo( - "arcee-ai/Trinity-Nano", - is_available_online=False, - ), + "AfmoeForCausalLM": _HfExamplesInfo("arcee-ai/Trinity-Nano-Preview"), "ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B-Instruct-2509"), "AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B", trust_remote_code=True), "AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B", trust_remote_code=True), diff --git a/vllm/model_executor/models/afmoe.py b/vllm/model_executor/models/afmoe.py index 3ced52c2050d6..f5dfe43067414 100644 --- a/vllm/model_executor/models/afmoe.py +++ b/vllm/model_executor/models/afmoe.py @@ -242,7 +242,7 @@ class AfmoeAttention(nn.Module): self.rotary_emb = get_rope( self.head_dim, max_position=max_position_embeddings, - rope_parameters=config["rope_parameters"], + rope_parameters=config.rope_parameters, is_neox_style=True, ) else: From 947dfda9c281c2b2d779a29e73bbc20170dcfab3 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 11 Dec 2025 19:18:47 -0800 Subject: [PATCH 20/57] [LMCache] Relax lmcache version requirement (#30425) Signed-off-by: Nick Hill --- requirements/kv_connectors.txt | 2 +- .../v1/lmcache_integration/vllm_v1_adapter.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/requirements/kv_connectors.txt b/requirements/kv_connectors.txt index f60a01a55d07c..083230c171096 100644 --- a/requirements/kv_connectors.txt +++ b/requirements/kv_connectors.txt @@ -1,2 +1,2 @@ -lmcache >= 0.3.10.post1 +lmcache nixl >= 0.7.1 # Required for disaggregated prefill diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py index cdc2969a7735e..09af128f3ed74 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py @@ -27,7 +27,14 @@ from lmcache.v1.lookup_client.lmcache_async_lookup_client import ( LMCacheAsyncLookupServer, ) from lmcache.v1.offload_server.zmq_server import ZMQOffloadServer -from lmcache.v1.plugin.runtime_plugin_launcher import RuntimePluginLauncher + +try: + from lmcache.v1.plugin.runtime_plugin_launcher import RuntimePluginLauncher +except ImportError: + # Backwards compatibility for lmcache <= 0.3.10-post1 + from lmcache.v1.plugin.plugin_launcher import ( + PluginLauncher as RuntimePluginLauncher, + ) from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig From 197473c4e71c99025a0fd3925d0f130bdbfa1e42 Mon Sep 17 00:00:00 2001 From: Ryan Rock Date: Thu, 11 Dec 2025 21:33:17 -0600 Subject: [PATCH 21/57] [CI/Build] Use spawn subprocess for ROCm (#30272) Signed-off-by: Ryan Rock --- examples/offline_inference/data_parallel.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index 0b281fc41a341..be0b846995a92 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -33,6 +33,7 @@ import os from time import sleep from vllm import LLM, SamplingParams +from vllm.platforms import current_platform from vllm.utils.network_utils import get_open_port @@ -222,6 +223,11 @@ if __name__ == "__main__": from multiprocessing import Process + if current_platform.is_rocm(): + from multiprocessing import set_start_method + + set_start_method("spawn", force=True) + procs = [] for local_dp_rank, global_dp_rank in enumerate( range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node) From 783644e4ac7d4ee324a1817bbe199fc5b557bc7d Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Thu, 11 Dec 2025 21:54:56 -0600 Subject: [PATCH 22/57] [ROCm][CI] Skip multi-GPU speculative decoding tests when insufficient GPUs available (#30527) Signed-off-by: Andreas Karatzas --- tests/v1/e2e/test_spec_decode.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index c8587659d6580..fcfc8bdce12e9 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -16,6 +16,16 @@ from vllm.platforms import current_platform MTP_SIMILARITY_RATE = 0.8 +def _skip_if_insufficient_gpus_for_tp(tp_size: int): + """Skip test if available GPUs < tp_size on ROCm.""" + if current_platform.is_rocm(): + available_gpus = torch.cuda.device_count() + if available_gpus < tp_size: + pytest.skip( + f"Test requires {tp_size} GPUs, but only {available_gpus} available" + ) + + def get_test_prompts(mm_enabled: bool): prompt_types = ["repeat", "sentence"] if mm_enabled: @@ -455,6 +465,8 @@ def test_eagle_correctness( m.setenv("VLLM_ROCM_USE_AITER", "1") method, model_name, spec_model_name, tp_size = model_setup + _skip_if_insufficient_gpus_for_tp(tp_size) + max_model_len = 2048 max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len @@ -525,6 +537,7 @@ def test_mtp_correctness( m.setenv("VLLM_MLA_DISABLE", "1") method, model_name, tp_size = model_setup + _skip_if_insufficient_gpus_for_tp(tp_size) ref_llm = LLM( model=model_name, From fe1787107e5214ffb1f0943bf9dd0215cf85ebf2 Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Thu, 11 Dec 2025 23:30:51 -0500 Subject: [PATCH 23/57] [compile] Parse compile range cache keys as Range during cache loading. (#30516) Signed-off-by: zhxchen17 --- vllm/compilation/backends.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index dd2233522263d..8fcd2b42e13bb 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -141,7 +141,25 @@ class CompilerManager: # we use ast.literal_eval to parse the data # because it is a safe way to parse Python literals. # do not use eval(), it is unsafe. - self.cache = ast.literal_eval(f.read()) + cache = ast.literal_eval(f.read()) + + def check_type(value, ty): + if not isinstance(value, ty): + raise TypeError(f"Expected {ty} but got {type(value)} for {value}") + + def parse_key(key: Any) -> tuple[Range, int, str]: + range_tuple, graph_index, compiler_name = key + check_type(graph_index, int) + check_type(compiler_name, str) + if isinstance(range_tuple, tuple): + start, end = range_tuple + check_type(start, int) + check_type(end, int) + range_tuple = Range(start=start, end=end) + check_type(range_tuple, Range) + return range_tuple, graph_index, compiler_name + + self.cache = {parse_key(key): value for key, value in cache.items()} self.compiler.initialize_cache( cache_dir=cache_dir, disable_cache=disable_cache, prefix=prefix From 8f8fda261a620234fdeea338f44093d5d8072879 Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Thu, 11 Dec 2025 23:59:53 -0500 Subject: [PATCH 24/57] [Bugfix] Multiple fixes for gpt-oss Chat Completion prompting (#28729) Signed-off-by: Ben Browning Co-authored-by: Chauncey --- .../openai/parser/test_harmony_utils.py | 807 +++++++++++++++--- tests/entrypoints/openai/test_serving_chat.py | 646 +++++++++++++- tests/entrypoints/openai/utils.py | 190 +++++ .../openai/parser/harmony_utils.py | 225 ++++- vllm/entrypoints/openai/serving_chat.py | 13 +- .../openai/tool_parsers/openai_tool_parser.py | 7 +- 6 files changed, 1749 insertions(+), 139 deletions(-) create mode 100644 tests/entrypoints/openai/utils.py diff --git a/tests/entrypoints/openai/parser/test_harmony_utils.py b/tests/entrypoints/openai/parser/test_harmony_utils.py index a3fd80938de6a..1d34fc51ad563 100644 --- a/tests/entrypoints/openai/parser/test_harmony_utils.py +++ b/tests/entrypoints/openai/parser/test_harmony_utils.py @@ -1,21 +1,37 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest from openai.types.responses import ResponseFunctionToolCall, ResponseReasoningItem from openai.types.responses.response_output_item import McpCall from openai_harmony import Author, Message, Role, TextContent +from tests.entrypoints.openai.utils import verify_harmony_messages from vllm.entrypoints.openai.parser.harmony_utils import ( + auto_drop_analysis_messages, + get_encoding, has_custom_tools, + parse_chat_input_to_harmony_message, + parse_chat_output, parse_input_to_harmony_message, parse_output_message, ) -class TestParseInputToHarmonyMessage: - """Tests for parse_input_to_harmony_message function.""" +class TestCommonParseInputToHarmonyMessage: + """ + Tests for scenarios that are common to both Chat Completion + parse_chat_input_to_harmony_message and Responsees API + parse_input_to_harmony_message functions. + """ - def test_assistant_message_with_tool_calls(self): + @pytest.fixture( + params=[parse_chat_input_to_harmony_message, parse_input_to_harmony_message] + ) + def parse_function(self, request): + return request.param + + def test_assistant_message_with_tool_calls(self, parse_function): """Test parsing assistant message with tool calls.""" chat_msg = { "role": "assistant", @@ -35,7 +51,7 @@ class TestParseInputToHarmonyMessage: ], } - messages = parse_input_to_harmony_message(chat_msg) + messages = parse_function(chat_msg) assert len(messages) == 2 @@ -53,7 +69,7 @@ class TestParseInputToHarmonyMessage: assert messages[1].recipient == "functions.search_web" assert messages[1].content_type == "json" - def test_assistant_message_with_empty_tool_call_arguments(self): + def test_assistant_message_with_empty_tool_call_arguments(self, parse_function): """Test parsing assistant message with tool call having None arguments.""" chat_msg = { "role": "assistant", @@ -67,12 +83,152 @@ class TestParseInputToHarmonyMessage: ], } - messages = parse_input_to_harmony_message(chat_msg) + messages = parse_function(chat_msg) assert len(messages) == 1 assert messages[0].content[0].text == "" assert messages[0].recipient == "functions.get_current_time" + def test_system_message(self, parse_function): + """Test parsing system message.""" + chat_msg = { + "role": "system", + "content": "You are a helpful assistant", + } + + messages = parse_function(chat_msg) + + assert len(messages) == 1 + # System messages are converted using Message.from_dict + # which should preserve the role + assert messages[0].author.role == Role.SYSTEM + + def test_developer_message(self, parse_function): + """Test parsing developer message.""" + chat_msg = { + "role": "developer", + "content": "Use concise language", + } + + messages = parse_function(chat_msg) + + assert len(messages) == 1 + assert messages[0].author.role == Role.DEVELOPER + + def test_user_message_with_string_content(self, parse_function): + """Test parsing user message with string content.""" + chat_msg = { + "role": "user", + "content": "What's the weather in San Francisco?", + } + + messages = parse_function(chat_msg) + + assert len(messages) == 1 + assert messages[0].author.role == Role.USER + assert messages[0].content[0].text == "What's the weather in San Francisco?" + + def test_user_message_with_array_content(self, parse_function): + """Test parsing user message with array content.""" + chat_msg = { + "role": "user", + "content": [ + {"text": "What's in this image? "}, + {"text": "Please describe it."}, + ], + } + + messages = parse_function(chat_msg) + + assert len(messages) == 1 + assert messages[0].author.role == Role.USER + assert len(messages[0].content) == 2 + assert messages[0].content[0].text == "What's in this image? " + assert messages[0].content[1].text == "Please describe it." + + def test_assistant_message_with_string_content(self, parse_function): + """Test parsing assistant message with string content (no tool calls).""" + chat_msg = { + "role": "assistant", + "content": "Hello! How can I help you today?", + } + + messages = parse_function(chat_msg) + + assert len(messages) == 1 + assert messages[0].author.role == Role.ASSISTANT + assert messages[0].content[0].text == "Hello! How can I help you today?" + + def test_pydantic_model_input(self, parse_function): + """Test parsing Pydantic model input (has model_dump method).""" + + class MockPydanticModel: + def model_dump(self, exclude_none=True): + return { + "role": "user", + "content": "Test message", + } + + chat_msg = MockPydanticModel() + messages = parse_function(chat_msg) + + assert len(messages) == 1 + assert messages[0].author.role == Role.USER + assert messages[0].content[0].text == "Test message" + + def test_tool_call_with_missing_function_fields(self, parse_function): + """Test parsing tool call with missing name or arguments.""" + chat_msg = { + "role": "assistant", + "tool_calls": [ + { + "function": {} # Missing both name and arguments + } + ], + } + + messages = parse_function(chat_msg) + + assert len(messages) == 1 + assert messages[0].recipient == "functions." + assert messages[0].content[0].text == "" + + def test_array_content_with_missing_text(self, parse_function): + """Test parsing array content where text field is missing.""" + chat_msg = { + "role": "user", + "content": [ + {}, # Missing text field + {"text": "actual text"}, + ], + } + + messages = parse_function(chat_msg) + + assert len(messages) == 1 + assert len(messages[0].content) == 2 + assert messages[0].content[0].text == "" + assert messages[0].content[1].text == "actual text" + + +class TestParseInputToHarmonyMessage: + """ + Tests for scenarios that are specific to the Responses API + parse_input_to_harmony_message function. + """ + + def test_message_with_empty_content(self): + """Test parsing message with empty string content.""" + chat_msg = { + "role": "user", + "content": "", + } + + messages = parse_input_to_harmony_message(chat_msg) + + assert len(messages) == 1 + assert messages[0].content[0].text == "" + def test_tool_message_with_string_content(self): """Test parsing tool message with string content.""" chat_msg = { @@ -111,6 +267,7 @@ class TestParseInputToHarmonyMessage: assert len(messages) == 1 assert messages[0].author.role == Role.TOOL + assert messages[0].author.name == "functions.search_results" assert messages[0].content[0].text == "Result 1: Result 2: Result 3" def test_tool_message_with_empty_content(self): @@ -124,140 +281,564 @@ class TestParseInputToHarmonyMessage: messages = parse_input_to_harmony_message(chat_msg) assert len(messages) == 1 + assert messages[0].author.role == Role.TOOL + assert messages[0].author.name == "functions.empty_tool" assert messages[0].content[0].text == "" - def test_system_message(self): - """Test parsing system message.""" - chat_msg = { - "role": "system", - "content": "You are a helpful assistant", - } - messages = parse_input_to_harmony_message(chat_msg) +class TestParseChatInputToHarmonyMessage: + """ + Tests for scenarios that are specific to the Chat Completion API + parse_chat_input_to_harmony_message function. + """ - assert len(messages) == 1 - # System messages are converted using Message.from_dict - # which should preserve the role - assert messages[0].author.role == Role.SYSTEM - - def test_developer_message(self): - """Test parsing developer message.""" - chat_msg = { - "role": "developer", - "content": "Use concise language", - } - - messages = parse_input_to_harmony_message(chat_msg) - - assert len(messages) == 1 - assert messages[0].author.role == Role.DEVELOPER - - def test_user_message_with_string_content(self): - """Test parsing user message with string content.""" - chat_msg = { - "role": "user", - "content": "What's the weather in San Francisco?", - } - - messages = parse_input_to_harmony_message(chat_msg) - - assert len(messages) == 1 - assert messages[0].author.role == Role.USER - assert messages[0].content[0].text == "What's the weather in San Francisco?" - - def test_user_message_with_array_content(self): - """Test parsing user message with array content.""" - chat_msg = { - "role": "user", - "content": [ - {"text": "What's in this image? "}, - {"text": "Please describe it."}, - ], - } - - messages = parse_input_to_harmony_message(chat_msg) - - assert len(messages) == 1 - assert messages[0].author.role == Role.USER - assert len(messages[0].content) == 2 - assert messages[0].content[0].text == "What's in this image? " - assert messages[0].content[1].text == "Please describe it." - - def test_assistant_message_with_string_content(self): - """Test parsing assistant message with string content (no tool calls).""" - chat_msg = { - "role": "assistant", - "content": "Hello! How can I help you today?", - } - - messages = parse_input_to_harmony_message(chat_msg) - - assert len(messages) == 1 - assert messages[0].author.role == Role.ASSISTANT - assert messages[0].content[0].text == "Hello! How can I help you today?" - - def test_pydantic_model_input(self): - """Test parsing Pydantic model input (has model_dump method).""" - - class MockPydanticModel: - def model_dump(self, exclude_none=True): - return { - "role": "user", - "content": "Test message", - } - - chat_msg = MockPydanticModel() - messages = parse_input_to_harmony_message(chat_msg) - - assert len(messages) == 1 - assert messages[0].author.role == Role.USER - assert messages[0].content[0].text == "Test message" - - def test_message_with_empty_content(self): - """Test parsing message with empty string content.""" + def test_user_message_with_empty_content(self): chat_msg = { "role": "user", "content": "", } - messages = parse_input_to_harmony_message(chat_msg) + messages = parse_chat_input_to_harmony_message(chat_msg) - assert len(messages) == 1 - assert messages[0].content[0].text == "" + verify_harmony_messages( + messages, + [ + { + "role": "user", + "content": "", + }, + ], + ) - def test_tool_call_with_missing_function_fields(self): - """Test parsing tool call with missing name or arguments.""" + def test_user_message_with_none_content(self): + chat_msg = { + "role": "user", + "content": None, + } + + messages = parse_chat_input_to_harmony_message(chat_msg) + + verify_harmony_messages( + messages, + [ + { + "role": "user", + "content": "", + }, + ], + ) + + def test_assistant_message_with_empty_content(self): + chat_msg = { + "role": "assistant", + "content": "", + } + + messages = parse_chat_input_to_harmony_message(chat_msg) + + assert len(messages) == 0 + + def test_assistant_message_with_none_content(self): + chat_msg = { + "role": "assistant", + "content": None, + } + + messages = parse_chat_input_to_harmony_message(chat_msg) + + assert len(messages) == 0 + + def test_assistant_message_with_content_but_empty_reasoning(self): + chat_msg = { + "role": "assistant", + "content": "The answer is 4.", + "reasoning": "", + } + + messages = parse_chat_input_to_harmony_message(chat_msg) + + verify_harmony_messages( + messages, + [ + { + "role": "assistant", + "channel": "final", + "content": "The answer is 4.", + }, + ], + ) + + def test_assistant_message_with_reasoning_but_empty_content(self): + chat_msg = { + "role": "assistant", + "reasoning": "I'm thinking about the user's question.", + "content": "", + } + + messages = parse_chat_input_to_harmony_message(chat_msg) + + verify_harmony_messages( + messages, + [ + { + "role": "assistant", + "channel": "analysis", + "content": "I'm thinking about the user's question.", + }, + ], + ) + + def test_assistant_message_with_reasoning_but_none_content(self): + chat_msg = { + "role": "assistant", + "reasoning": "I'm thinking about the user's question.", + "content": None, + } + + messages = parse_chat_input_to_harmony_message(chat_msg) + + verify_harmony_messages( + messages, + [ + { + "role": "assistant", + "channel": "analysis", + "content": "I'm thinking about the user's question.", + }, + ], + ) + + def test_assistant_message_with_tool_calls_but_no_content(self): chat_msg = { "role": "assistant", "tool_calls": [ { - "function": {} # Missing both name and arguments + "function": { + "name": "get_weather", + "arguments": '{"location": "San Francisco"}', + } } ], } - messages = parse_input_to_harmony_message(chat_msg) + messages = parse_chat_input_to_harmony_message(chat_msg) - assert len(messages) == 1 - assert messages[0].recipient == "functions." - assert messages[0].content[0].text == "" + verify_harmony_messages( + messages, + [ + { + "role": "assistant", + "channel": "commentary", + "recipient": "functions.get_weather", + "content": '{"location": "San Francisco"}', + "content_type": "json", + }, + ], + ) - def test_array_content_with_missing_text(self): - """Test parsing array content where text field is missing.""" + def test_assistant_message_with_tool_calls_and_content(self): chat_msg = { - "role": "user", + "role": "assistant", + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": '{"location": "San Francisco"}', + } + } + ], + "content": "I'll call the tool.", + } + + messages = parse_chat_input_to_harmony_message(chat_msg) + + verify_harmony_messages( + messages, + [ + { + "role": "assistant", + "channel": "commentary", + "content": "I'll call the tool.", + }, + { + "role": "assistant", + "channel": "commentary", + "recipient": "functions.get_weather", + "content": '{"location": "San Francisco"}', + "content_type": "json", + }, + ], + ) + + def test_assistant_message_with_tool_calls_and_reasoning(self): + chat_msg = { + "role": "assistant", + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": '{"location": "San Francisco"}', + } + } + ], + "reasoning": "I should use the get_weather tool.", + } + + messages = parse_chat_input_to_harmony_message(chat_msg) + + verify_harmony_messages( + messages, + [ + { + "role": "assistant", + "channel": "analysis", + "content": "I should use the get_weather tool.", + }, + { + "role": "assistant", + "channel": "commentary", + "recipient": "functions.get_weather", + "content": '{"location": "San Francisco"}', + "content_type": "json", + }, + ], + ) + + def test_assistant_message_with_tool_calls_and_reasoning_and_content(self): + chat_msg = { + "role": "assistant", + "tool_calls": [ + { + "function": { + "name": "get_weather", + "arguments": '{"location": "San Francisco"}', + } + } + ], + "reasoning": "I should use the get_weather tool.", + "content": "I'll call the tool.", + } + + messages = parse_chat_input_to_harmony_message(chat_msg) + + verify_harmony_messages( + messages, + [ + { + "role": "assistant", + "channel": "commentary", + "content": "I'll call the tool.", + }, + { + "role": "assistant", + "channel": "analysis", + "content": "I should use the get_weather tool.", + }, + { + "role": "assistant", + "channel": "commentary", + "recipient": "functions.get_weather", + "content": '{"location": "San Francisco"}', + "content_type": "json", + }, + ], + ) + + def test_tool_message_with_string_content(self): + tool_id_names = { + "call_123": "get_weather", + } + chat_msg = { + "role": "tool", + "tool_call_id": "call_123", + "content": "The weather in San Francisco is sunny, 72°F", + } + + messages = parse_chat_input_to_harmony_message( + chat_msg, tool_id_names=tool_id_names + ) + + verify_harmony_messages( + messages, + [ + { + "role": "tool", + "name": "functions.get_weather", + "content": "The weather in San Francisco is sunny, 72°F", + "channel": "commentary", + }, + ], + ) + + def test_tool_message_with_array_content(self): + tool_id_names = { + "call_123": "search_results", + } + chat_msg = { + "role": "tool", + "tool_call_id": "call_123", "content": [ - {}, # Missing text field - {"text": "actual text"}, + {"type": "text", "text": "Result 1: "}, + {"type": "text", "text": "Result 2: "}, + { + "type": "image", + "url": "http://example.com/img.png", + }, # Should be ignored + {"type": "text", "text": "Result 3"}, ], } - messages = parse_input_to_harmony_message(chat_msg) + messages = parse_chat_input_to_harmony_message( + chat_msg, tool_id_names=tool_id_names + ) - assert len(messages) == 1 - assert len(messages[0].content) == 2 - assert messages[0].content[0].text == "" - assert messages[0].content[1].text == "actual text" + verify_harmony_messages( + messages, + [ + { + "role": "tool", + "name": "functions.search_results", + "content": "Result 1: Result 2: Result 3", + "channel": "commentary", + }, + ], + ) + + def test_tool_message_with_empty_content(self): + tool_id_names = { + "call_123": "empty_tool", + } + chat_msg = { + "role": "tool", + "tool_call_id": "call_123", + "content": "", + } + + messages = parse_chat_input_to_harmony_message( + chat_msg, tool_id_names=tool_id_names + ) + + verify_harmony_messages( + messages, + [ + { + "role": "tool", + "name": "functions.empty_tool", + "content": "", + "channel": "commentary", + }, + ], + ) + + def test_tool_message_with_none_content(self): + tool_id_names = { + "call_123": "empty_tool", + } + chat_msg = { + "role": "tool", + "tool_call_id": "call_123", + "content": None, + } + + messages = parse_chat_input_to_harmony_message( + chat_msg, tool_id_names=tool_id_names + ) + + verify_harmony_messages( + messages, + [ + { + "role": "tool", + "name": "functions.empty_tool", + "content": "", + "channel": "commentary", + }, + ], + ) + + +class TestAutoDropAnalysisMessages: + def test_no_analysis_messages(self) -> None: + messages = [ + Message.from_role_and_content( + Role.ASSISTANT, "The answer is 4." + ).with_channel("final"), + ] + cleaned_messages = auto_drop_analysis_messages(messages) + assert cleaned_messages == messages + + def test_only_analysis_message(self) -> None: + messages = [ + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking about the user's question." + ).with_channel("analysis"), + ] + cleaned_messages = auto_drop_analysis_messages(messages) + assert cleaned_messages == messages + + def test_multiple_analysis_messages_without_final_message(self) -> None: + messages = [ + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking about the user's question." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking more." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking even more." + ).with_channel("analysis"), + ] + cleaned_messages = auto_drop_analysis_messages(messages) + assert cleaned_messages == messages + + def test_only_final_message(self) -> None: + messages = [ + Message.from_role_and_content( + Role.ASSISTANT, "The answer is 4." + ).with_channel("final"), + ] + cleaned_messages = auto_drop_analysis_messages(messages) + assert cleaned_messages == messages + + def test_drops_one_analysis_messages_before_final_message(self) -> None: + messages = [ + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking about the user's question." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "The answer is 4." + ).with_channel("final"), + Message.from_role_and_content( + Role.ASSISTANT, "I should think harder." + ).with_channel("analysis"), + ] + cleaned_messages = auto_drop_analysis_messages(messages) + # Should have dropped the first analysis message + assert cleaned_messages == messages[1:] + + def test_drops_all_analysis_messages_before_final_message(self) -> None: + messages = [ + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking about the user's question." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking more." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking even more." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "The answer is 4." + ).with_channel("final"), + Message.from_role_and_content( + Role.ASSISTANT, "I should think harder." + ).with_channel("analysis"), + ] + cleaned_messages = auto_drop_analysis_messages(messages) + # Should have dropped the first 3 analysis messages + assert cleaned_messages == messages[3:] + + def test_multiple_analysis_messages_with_multiple_final_messages(self) -> None: + messages = [ + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking about the user's question." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking more." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "I'm thinking even more." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "The answer is 4." + ).with_channel("final"), + Message.from_role_and_content( + Role.ASSISTANT, "I should think harder." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "The answer is 5." + ).with_channel("final"), + ] + cleaned_messages = auto_drop_analysis_messages(messages) + # Should have dropped all those analysis messages + assert len(cleaned_messages) == 2 + assert cleaned_messages[0].content[0].text == "The answer is 4." + assert cleaned_messages[1].content[0].text == "The answer is 5." + + def test_drops_non_assistant_analysis_messages(self) -> None: + messages = [ + Message.from_role_and_content( + Role.TOOL, "The tool thinks we should think harder." + ).with_channel("analysis"), + Message.from_role_and_content( + Role.ASSISTANT, "The answer is 4." + ).with_channel("final"), + ] + cleaned_messages = auto_drop_analysis_messages(messages) + # Should have dropped the analysis message + assert cleaned_messages == messages[1:] + + +class TestParseChatOutput: + def test_parse_chat_output_interrupted_first_message(self) -> None: + harmony_str = "<|channel|>final<|message|>I'm in the middle of answering" + token_ids = get_encoding().encode(harmony_str, allowed_special="all") + reasoning, final_content, _ = parse_chat_output(token_ids) + assert reasoning is None + assert final_content == "I'm in the middle of answering" + + def test_parse_chat_output_interrupted_reasoning_first_message(self) -> None: + harmony_str = "<|channel|>analysis<|message|>I'm in the middle of thinking" + token_ids = get_encoding().encode(harmony_str, allowed_special="all") + reasoning, final_content, _ = parse_chat_output(token_ids) + assert reasoning == "I'm in the middle of thinking" + assert final_content is None + + def test_parse_chat_output_complete_reasoning_interrupted_content(self) -> None: + harmony_str = ( + "<|channel|>analysis<|message|>I'm thinking.<|end|>" + "<|start|>assistant<|channel|>final" + "<|message|>I'm in the middle of answering" + ) + token_ids = get_encoding().encode(harmony_str, allowed_special="all") + reasoning, final_content, _ = parse_chat_output(token_ids) + assert reasoning == "I'm thinking." + assert final_content == "I'm in the middle of answering" + + def test_parse_chat_output_complete_content(self) -> None: + harmony_str = "<|channel|>final<|message|>The answer is 4.<|end|>" + token_ids = get_encoding().encode(harmony_str, allowed_special="all") + reasoning, final_content, _ = parse_chat_output(token_ids) + assert reasoning is None + assert final_content == "The answer is 4." + + def test_parse_chat_output_complete_commentary(self) -> None: + harmony_str = ( + "<|channel|>commentary<|message|>I need to call some tools.<|end|>" + ) + token_ids = get_encoding().encode(harmony_str, allowed_special="all") + reasoning, final_content, _ = parse_chat_output(token_ids) + assert reasoning is None + assert final_content == "I need to call some tools." + + def test_parse_chat_output_complete_reasoning(self) -> None: + harmony_str = ( + "<|channel|>analysis<|message|>I've thought hard about this.<|end|>" + ) + token_ids = get_encoding().encode(harmony_str, allowed_special="all") + reasoning, final_content, _ = parse_chat_output(token_ids) + assert reasoning == "I've thought hard about this." + assert final_content is None + + def test_parse_chat_output_complete_reasoning_and_content(self) -> None: + harmony_str = ( + "<|channel|>analysis<|message|>I've thought hard about this.<|end|>" + "<|start|>assistant<|channel|>final<|message|>The answer is 4.<|end|>" + ) + token_ids = get_encoding().encode(harmony_str, allowed_special="all") + reasoning, final_content, _ = parse_chat_output(token_ids) + assert reasoning == "I've thought hard about this." + assert final_content == "The answer is 4." class TestParseOutputMessage: diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 9ea65f9fa6e7a..5a9293f1b9ae5 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -11,13 +11,25 @@ import pytest_asyncio from openai import OpenAI from vllm.config.multimodal import MultiModalConfig -from vllm.entrypoints.openai.protocol import ChatCompletionRequest +from vllm.entrypoints.openai.parser.harmony_utils import get_encoding +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + ChatCompletionResponse, + RequestResponseMetadata, +) from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels +from vllm.entrypoints.openai.tool_parsers import ToolParserManager +from vllm.outputs import CompletionOutput, RequestOutput from vllm.tokenizers import get_tokenizer from vllm.v1.engine.async_llm import AsyncLLM from ...utils import RemoteOpenAIServer +from .utils import ( + accumulate_streaming_response, + verify_chat_response, + verify_harmony_messages, +) GPT_OSS_MODEL_NAME = "openai/gpt-oss-20b" @@ -728,3 +740,635 @@ async def test_serving_chat_data_parallel_rank_extraction(): # Verify that data_parallel_rank defaults to None assert "data_parallel_rank" in mock_engine.generate.call_args.kwargs assert mock_engine.generate.call_args.kwargs["data_parallel_rank"] is None + + +class TestServingChatWithHarmony: + """ + These tests ensure Chat Completion requests are being properly converted into + Harmony messages and Harmony response messages back into Chat Completion responses. + These tests are not exhaustive, but each one was created to cover a specific case + that we got wrong but is now fixed. + + Any changes to the tests and their expectations may result in changes to the + accuracy of model prompting and responses generated. It is suggested to run + an evaluation or benchmarking suite (such as bfcl multi_turn) to understand + any impact of changes in how we prompt Harmony models. + """ + + @pytest.fixture(params=[False, True], ids=["non_streaming", "streaming"]) + def stream(self, request) -> bool: + """Parameterize tests to run in both non-streaming and streaming modes.""" + return request.param + + @pytest.fixture() + def mock_engine(self) -> AsyncLLM: + mock_engine = MagicMock(spec=AsyncLLM) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + mock_engine.model_config = MockModelConfig() + mock_engine.input_processor = MagicMock() + mock_engine.io_processor = MagicMock() + return mock_engine + + @pytest.fixture() + def serving_chat(self, mock_engine) -> OpenAIServingChat: + chat = _build_serving_chat(mock_engine) + chat.use_harmony = True + chat.tool_parser = ToolParserManager.get_tool_parser("openai") + return chat + + def mock_request_output_from_req_and_token_ids( + self, req: ChatCompletionRequest, token_ids: list[int], finished: bool = False + ) -> RequestOutput: + # Our tests don't use most fields, so just get the token ids correct + completion_output = CompletionOutput( + index=0, + text="", + token_ids=token_ids, + cumulative_logprob=0.0, + logprobs=None, + ) + return RequestOutput( + request_id=req.request_id, + prompt=[], + prompt_token_ids=[], + prompt_logprobs=None, + outputs=[completion_output], + finished=finished, + ) + + @pytest.fixture + def weather_tools(self) -> list[dict[str, Any]]: + return [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"}, + }, + "required": ["location"], + }, + }, + }, + ] + + @pytest.fixture + def weather_messages_start(self) -> list[dict[str, Any]]: + return [ + { + "role": "user", + "content": "What's the weather like in Paris today?", + }, + ] + + async def generate_response_from_harmony_str( + self, + serving_chat: OpenAIServingChat, + req: ChatCompletionRequest, + harmony_str: str, + stream: bool = False, + ) -> ChatCompletionResponse: + harmony_token_ids = get_encoding().encode(harmony_str, allowed_special="all") + + async def result_generator(): + if stream: + for token_id in harmony_token_ids: + yield self.mock_request_output_from_req_and_token_ids( + req, [token_id] + ) + yield self.mock_request_output_from_req_and_token_ids( + req, [], finished=True + ) + else: + yield self.mock_request_output_from_req_and_token_ids( + req, harmony_token_ids, finished=True + ) + + generator_func = ( + serving_chat.chat_completion_stream_generator + if stream + else serving_chat.chat_completion_full_generator + ) + + result = generator_func( + request=req, + result_generator=result_generator(), + request_id=req.request_id, + model_name=req.model, + conversation=[], + tokenizer=get_tokenizer(req.model), + request_metadata=RequestResponseMetadata( + request_id=req.request_id, + model_name=req.model, + ), + ) + + if stream: + return await accumulate_streaming_response(result) + return await result + + @pytest.mark.asyncio + async def test_simple_chat(self, serving_chat, stream): + messages = [{"role": "user", "content": "what is 1+1?"}] + + # Test the Harmony messages for the first turn's input + req = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages, _, _ = serving_chat._make_request_with_harmony(req) + verify_harmony_messages( + input_messages, + [ + {"role": "system"}, + {"role": "developer"}, + {"role": "user", "content": messages[0]["content"]}, + ], + ) + + # Test the Chat Completion response for the first turn's output + reasoning_str = "We need to think really hard about this." + final_str = "The answer is 2." + response_str = ( + f"<|channel|>analysis<|message|>{reasoning_str}<|end|>" + f"<|start|>assistant<|channel|>final<|message|>{final_str}<|end|>" + ) + response = await self.generate_response_from_harmony_str( + serving_chat, req, response_str, stream=stream + ) + verify_chat_response(response, content=final_str, reasoning=reasoning_str) + + # Add the output messages from the first turn as input to the second turn + for choice in response.choices: + messages.append(choice.message.model_dump(exclude_none=True)) + + # Test the Harmony messages for the second turn's input + req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages_2, _, _ = serving_chat._make_request_with_harmony(req_2) + verify_harmony_messages( + input_messages_2, + [ + {"role": "system"}, + {"role": "developer"}, + {"role": "user"}, + # The analysis message should be dropped on subsequent inputs because + # of the subsequent assistant message to the final channel. + {"role": "assistant", "channel": "final", "content": final_str}, + ], + ) + + @pytest.mark.asyncio + async def test_tool_call_response_with_content( + self, serving_chat, stream, weather_tools, weather_messages_start + ): + tools = weather_tools + messages = list(weather_messages_start) + + # Test the Harmony messages for the first turn's input + req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) + input_messages, _, _ = serving_chat._make_request_with_harmony(req) + verify_harmony_messages( + input_messages, + [ + {"role": "system"}, + {"role": "developer", "tool_definitions": ["get_weather"]}, + {"role": "user", "content": messages[0]["content"]}, + ], + ) + + # Test the Chat Completion response for the first turn's output + commentary_str = "We'll call get_weather." + tool_args_str = '{"location": "Paris"}' + response_str = ( + f"<|channel|>commentary<|message|>{commentary_str}<|end|>" + "<|start|>assistant to=functions.get_weather<|channel|>commentary" + f"<|constrain|>json<|message|>{tool_args_str}<|call|>" + ) + response = await self.generate_response_from_harmony_str( + serving_chat, req, response_str, stream=stream + ) + verify_chat_response( + response, + content=commentary_str, + tool_calls=[("get_weather", tool_args_str)], + ) + + tool_call = response.choices[0].message.tool_calls[0] + + # Add the output messages from the first turn as input to the second turn + for choice in response.choices: + messages.append(choice.message.model_dump(exclude_none=True)) + + # Add our tool output message + messages.append( + { + "role": "tool", + "tool_call_id": tool_call.id, + "content": "20 degrees Celsius", + }, + ) + + # Test the Harmony messages for the second turn's input + req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages_2, _, _ = serving_chat._make_request_with_harmony(req_2) + verify_harmony_messages( + input_messages_2, + [ + {"role": "system"}, + {"role": "developer"}, + {"role": "user"}, + { + "role": "assistant", + "channel": "commentary", + "content": commentary_str, + }, + { + "role": "assistant", + "channel": "commentary", + "recipient": "functions.get_weather", + "content": tool_args_str, + }, + { + "role": "tool", + "author_name": "functions.get_weather", + "channel": "commentary", + "recipient": "assistant", + "content": "20 degrees Celsius", + }, + ], + ) + + @pytest.mark.asyncio + async def test_tools_and_reasoning( + self, serving_chat, stream, weather_tools, weather_messages_start + ): + tools = weather_tools + messages = list(weather_messages_start) + + # Test the Harmony messages for the first turn's input + req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) + input_messages, _, _ = serving_chat._make_request_with_harmony(req) + verify_harmony_messages( + input_messages, + [ + {"role": "system"}, + {"role": "developer", "tool_definitions": ["get_weather"]}, + {"role": "user", "content": messages[0]["content"]}, + ], + ) + + # Test the Chat Completion response for the first turn's output + reasoning_str = "I'll call get_weather." + tool_args_str = '{"location": "Paris"}' + response_str = ( + f"<|channel|>analysis<|message|>{reasoning_str}<|end|>" + "<|start|>assistant to=functions.get_weather<|channel|>commentary" + f"<|constrain|>json<|message|>{tool_args_str}<|call|>" + ) + response = await self.generate_response_from_harmony_str( + serving_chat, req, response_str, stream=stream + ) + verify_chat_response( + response, + reasoning=reasoning_str, + tool_calls=[("get_weather", tool_args_str)], + ) + + tool_call = response.choices[0].message.tool_calls[0] + + # Add the output messages from the first turn as input to the second turn + for choice in response.choices: + messages.append(choice.message.model_dump(exclude_none=True)) + + # Add our tool output message + messages.append( + { + "role": "tool", + "tool_call_id": tool_call.id, + "content": "20 degrees Celsius", + }, + ) + + # Test the Harmony messages for the second turn's input + req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages_2, _, _ = serving_chat._make_request_with_harmony(req_2) + verify_harmony_messages( + input_messages_2, + [ + {"role": "system"}, + {"role": "developer"}, + {"role": "user"}, + { + "role": "assistant", + "channel": "analysis", + "content": reasoning_str, + }, + { + "role": "assistant", + "channel": "commentary", + "recipient": "functions.get_weather", + "content": tool_args_str, + }, + { + "role": "tool", + "author_name": "functions.get_weather", + "channel": "commentary", + "recipient": "assistant", + "content": "20 degrees Celsius", + }, + ], + ) + + @pytest.mark.asyncio + async def test_multi_turn_tools_and_reasoning( + self, serving_chat, stream, weather_tools, weather_messages_start + ): + tools = weather_tools + messages = list(weather_messages_start) + + # Test the Harmony messages for the first turn's input + req = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools) + input_messages, _, _ = serving_chat._make_request_with_harmony(req) + verify_harmony_messages( + input_messages, + [ + {"role": "system"}, + {"role": "developer", "tool_definitions": ["get_weather"]}, + {"role": "user", "content": messages[0]["content"]}, + ], + ) + + # Test the Chat Completion response for the first turn's output + reasoning_str = "I'll call get_weather." + paris_tool_args_str = '{"location": "Paris"}' + response_str = ( + f"<|channel|>analysis<|message|>{reasoning_str}<|end|>" + "<|start|>assistant to=functions.get_weather<|channel|>commentary" + f"<|constrain|>json<|message|>{paris_tool_args_str}<|call|>" + ) + response = await self.generate_response_from_harmony_str( + serving_chat, req, response_str, stream=stream + ) + verify_chat_response( + response, + reasoning=reasoning_str, + tool_calls=[("get_weather", paris_tool_args_str)], + ) + + tool_call = response.choices[0].message.tool_calls[0] + + # Add the output messages from the first turn as input to the second turn + for choice in response.choices: + messages.append(choice.message.model_dump(exclude_none=True)) + + # Add our tool output message + messages.append( + { + "role": "tool", + "tool_call_id": tool_call.id, + "content": "20 degrees Celsius", + }, + ) + + # Test the Harmony messages for the second turn's input + req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages_2, _, _ = serving_chat._make_request_with_harmony(req_2) + verify_harmony_messages( + input_messages_2, + [ + {"role": "system"}, + {"role": "developer"}, + {"role": "user"}, + { + "role": "assistant", + "channel": "analysis", + "content": reasoning_str, + }, + { + "role": "assistant", + "channel": "commentary", + "recipient": "functions.get_weather", + "content": paris_tool_args_str, + }, + { + "role": "tool", + "author_name": "functions.get_weather", + "channel": "commentary", + "recipient": "assistant", + "content": "20 degrees Celsius", + }, + ], + ) + + # Test the Chat Completion response for the second turn's output + paris_weather_str = "The weather in Paris today is 20 degrees Celsius." + response_str = f"<|channel|>final<|message|>{paris_weather_str}<|end|>" + response_2 = await self.generate_response_from_harmony_str( + serving_chat, req_2, response_str, stream=stream + ) + verify_chat_response(response_2, content=paris_weather_str) + + # Add the output messages from the second turn as input to the third turn + for choice in response_2.choices: + messages.append(choice.message.model_dump(exclude_none=True)) + + # Add a new user message for the third turn + messages.append( + { + "role": "user", + "content": "What's the weather like in Boston today?", + }, + ) + + # Test the Harmony messages for the third turn's input + req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages_3, _, _ = serving_chat._make_request_with_harmony(req_3) + verify_harmony_messages( + input_messages_3, + [ + {"role": "system"}, + {"role": "developer"}, + {"role": "user"}, + { + "role": "assistant", + "channel": "commentary", + "recipient": "functions.get_weather", + "content": paris_tool_args_str, + }, + { + "role": "tool", + "author_name": "functions.get_weather", + "channel": "commentary", + "recipient": "assistant", + "content": "20 degrees Celsius", + }, + { + "role": "assistant", + "channel": "final", + "content": paris_weather_str, + }, + {"role": "user", "content": messages[-1]["content"]}, + ], + ) + + # Test the Chat Completion response for the third turn's output + reasoning_str = "I'll call get_weather." + boston_tool_args_str = '{"location": "Boston"}' + response_str = ( + f"<|channel|>analysis<|message|>{reasoning_str}<|end|>" + "<|start|>assistant to=functions.get_weather<|channel|>commentary" + f"<|constrain|>json<|message|>{boston_tool_args_str}<|call|>" + ) + response_3 = await self.generate_response_from_harmony_str( + serving_chat, req, response_str, stream=stream + ) + verify_chat_response( + response_3, + reasoning=reasoning_str, + tool_calls=[("get_weather", boston_tool_args_str)], + ) + + tool_call = response_3.choices[0].message.tool_calls[0] + + # Add the output messages from the third turn as input to the fourth turn + for choice in response_3.choices: + messages.append(choice.message.model_dump(exclude_none=True)) + + # Add our tool output message + messages.append( + { + "role": "tool", + "tool_call_id": tool_call.id, + "content": "10 degrees Celsius", + }, + ) + + # Test the Harmony messages for the fourth turn's input + req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages_4, _, _ = serving_chat._make_request_with_harmony(req_4) + verify_harmony_messages( + input_messages_4, + [ + {"role": "system"}, + {"role": "developer"}, + {"role": "user"}, + {"role": "assistant"}, + {"role": "tool"}, + { + "role": "assistant", + "channel": "final", + }, + {"role": "user"}, + { + "role": "assistant", + "channel": "analysis", + "content": reasoning_str, + }, + { + "role": "assistant", + "channel": "commentary", + "recipient": "functions.get_weather", + "content": boston_tool_args_str, + }, + { + "role": "tool", + "author_name": "functions.get_weather", + "channel": "commentary", + "recipient": "assistant", + "content": "10 degrees Celsius", + }, + ], + ) + + @pytest.mark.asyncio + async def test_non_tool_reasoning(self, serving_chat): + messages: list[dict[str, Any]] = [ + { + "role": "user", + "content": "What's 2+2?", + }, + { + "role": "assistant", + "reasoning": "Adding 2 and 2 is easy. The result is 4.", + "content": "4", + }, + ] + req = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages, _, _ = serving_chat._make_request_with_harmony(req) + + verify_harmony_messages( + input_messages, + [ + {"role": "system"}, + {"role": "developer"}, + {"role": "user", "content": messages[0]["content"]}, + # The reasoning that would have resulted in an analysis message is + # dropped because of a later assistant message to the final channel. + { + "role": "assistant", + "channel": "final", + "content": messages[1]["content"], + }, + ], + ) + + @pytest.mark.asyncio + async def test_non_tool_reasoning_empty_content(self, serving_chat): + messages: list[dict[str, Any]] = [ + { + "role": "user", + "content": "What's 2+2?", + }, + { + "role": "assistant", + "reasoning": "Adding 2 and 2 is easy. The result is 4.", + "content": "", + }, + ] + req = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages, _, _ = serving_chat._make_request_with_harmony(req) + + verify_harmony_messages( + input_messages, + [ + {"role": "system"}, + {"role": "developer"}, + {"role": "user", "content": messages[0]["content"]}, + { + "role": "assistant", + "channel": "analysis", + "content": messages[1]["reasoning"], + }, + ], + ) + + @pytest.mark.asyncio + async def test_non_tool_reasoning_empty_content_list(self, serving_chat): + messages: list[dict[str, Any]] = [ + { + "role": "user", + "content": "What's 2+2?", + }, + { + "role": "assistant", + "reasoning": "Adding 2 and 2 is easy. The result is 4.", + "content": [], + }, + ] + req = ChatCompletionRequest(model=MODEL_NAME, messages=messages) + input_messages, _, _ = serving_chat._make_request_with_harmony(req) + + verify_harmony_messages( + input_messages, + [ + {"role": "system"}, + {"role": "developer"}, + {"role": "user", "content": messages[0]["content"]}, + { + "role": "assistant", + "channel": "analysis", + "content": messages[1]["reasoning"], + }, + ], + ) diff --git a/tests/entrypoints/openai/utils.py b/tests/entrypoints/openai/utils.py new file mode 100644 index 0000000000000..501f6dcc91543 --- /dev/null +++ b/tests/entrypoints/openai/utils.py @@ -0,0 +1,190 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import json +from collections.abc import AsyncGenerator +from typing import Any + +from vllm.entrypoints.openai.protocol import ( + ChatCompletionResponse, + ChatCompletionResponseChoice, + ChatCompletionStreamResponse, + ChatMessage, + UsageInfo, +) + + +async def accumulate_streaming_response( + stream_generator: AsyncGenerator[str, None], +) -> ChatCompletionResponse: + """ + Accumulate streaming SSE chunks into a complete ChatCompletionResponse. + + This helper parses the SSE format and builds up the complete response + by combining all the delta chunks. + """ + accumulated_content = "" + accumulated_reasoning = None + accumulated_tool_calls: list[dict[str, Any]] = [] + role = None + finish_reason = None + response_id = None + created = None + model = None + index = 0 + + async for chunk_str in stream_generator: + # Skip empty lines and [DONE] marker + if not chunk_str.strip() or chunk_str.strip() == "data: [DONE]": + continue + + # Parse SSE format: "data: {json}\n\n" + if chunk_str.startswith("data: "): + json_str = chunk_str[6:].strip() + try: + chunk_data = json.loads(json_str) + # print(f"DEBUG: Parsed chunk_data: {chunk_data}") + chunk = ChatCompletionStreamResponse(**chunk_data) + + # Store metadata from first chunk + if response_id is None: + response_id = chunk.id + created = chunk.created + model = chunk.model + + # Process each choice in the chunk + for choice in chunk.choices: + if choice.delta.role: + role = choice.delta.role + if choice.delta.content: + accumulated_content += choice.delta.content + if choice.delta.reasoning: + if accumulated_reasoning is None: + accumulated_reasoning = "" + accumulated_reasoning += choice.delta.reasoning + if choice.delta.tool_calls: + # Accumulate tool calls + for tool_call_delta in choice.delta.tool_calls: + # Find or create the tool call at this index + while len(accumulated_tool_calls) <= tool_call_delta.index: + accumulated_tool_calls.append( + { + "id": None, + "type": "function", + "function": {"name": "", "arguments": ""}, + } + ) + + if tool_call_delta.id: + accumulated_tool_calls[tool_call_delta.index]["id"] = ( + tool_call_delta.id + ) + if tool_call_delta.function: + if tool_call_delta.function.name: + accumulated_tool_calls[tool_call_delta.index][ + "function" + ]["name"] += tool_call_delta.function.name + if tool_call_delta.function.arguments: + accumulated_tool_calls[tool_call_delta.index][ + "function" + ]["arguments"] += tool_call_delta.function.arguments + + if choice.finish_reason: + finish_reason = choice.finish_reason + if choice.index is not None: + index = choice.index + + except json.JSONDecodeError: + continue + + # Build the final message + message_kwargs = { + "role": role or "assistant", + "content": accumulated_content if accumulated_content else None, + "reasoning": accumulated_reasoning, + } + + # Only include tool_calls if there are any + if accumulated_tool_calls: + message_kwargs["tool_calls"] = [ + {"id": tc["id"], "type": tc["type"], "function": tc["function"]} + for tc in accumulated_tool_calls + ] + + message = ChatMessage(**message_kwargs) + + # Build the final response + choice = ChatCompletionResponseChoice( + index=index, + message=message, + finish_reason=finish_reason or "stop", + ) + + # Create usage info (with dummy values for tests) + usage = UsageInfo( + prompt_tokens=0, + completion_tokens=0, + total_tokens=0, + ) + + response = ChatCompletionResponse( + id=response_id or "chatcmpl-test", + object="chat.completion", + created=created or 0, + model=model or "test-model", + choices=[choice], + usage=usage, + ) + + return response + + +def verify_harmony_messages( + messages: list[Any], expected_messages: list[dict[str, Any]] +): + assert len(messages) == len(expected_messages) + for msg, expected in zip(messages, expected_messages): + if "role" in expected: + assert msg.author.role == expected["role"] + if "author_name" in expected: + assert msg.author.name == expected["author_name"] + if "channel" in expected: + assert msg.channel == expected["channel"] + if "recipient" in expected: + assert msg.recipient == expected["recipient"] + if "content" in expected: + assert msg.content[0].text == expected["content"] + if "content_type" in expected: + assert msg.content_type == expected["content_type"] + if "tool_definitions" in expected: + # Check that the tool definitions match the expected list of tool names + actual_tools = [t.name for t in msg.content[0].tools["functions"].tools] + assert actual_tools == expected["tool_definitions"] + + +def verify_chat_response( + response: ChatCompletionResponse, + content: str | None = None, + reasoning: str | None = None, + tool_calls: list[tuple[str, str]] | None = None, +): + assert len(response.choices) == 1 + message = response.choices[0].message + + if content is not None: + assert message.content == content + else: + assert not message.content + + if reasoning is not None: + assert message.reasoning == reasoning + else: + assert not message.reasoning + + if tool_calls: + assert message.tool_calls is not None + assert len(message.tool_calls) == len(tool_calls) + for tc, (expected_name, expected_args) in zip(message.tool_calls, tool_calls): + assert tc.function.name == expected_name + assert tc.function.arguments == expected_args + else: + assert not message.tool_calls diff --git a/vllm/entrypoints/openai/parser/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py index 2260e9604c3ed..376d97a03964e 100644 --- a/vllm/entrypoints/openai/parser/harmony_utils.py +++ b/vllm/entrypoints/openai/parser/harmony_utils.py @@ -232,7 +232,177 @@ def parse_response_input( return msg +def parse_chat_inputs_to_harmony_messages(chat_msgs: list) -> list[Message]: + """ + Parse a list of messages from request.messages in the Chat Completion API to + Harmony messages. + """ + msgs: list[Message] = [] + tool_id_names: dict[str, str] = {} + + # Collect tool id to name mappings for tool response recipient values + for chat_msg in chat_msgs: + for tool_call in chat_msg.get("tool_calls", []): + tool_id_names[tool_call.get("id")] = tool_call.get("function", {}).get( + "name" + ) + + for chat_msg in chat_msgs: + msgs.extend(parse_chat_input_to_harmony_message(chat_msg, tool_id_names)) + + msgs = auto_drop_analysis_messages(msgs) + return msgs + + +def auto_drop_analysis_messages(msgs: list[Message]) -> list[Message]: + """ + Harmony models expect the analysis messages (representing raw chain of thought) to + be dropped after an assistant message to the final channel is produced from the + reasoning of those messages. + + The openai-harmony library does this if the very last assistant message is to the + final channel, but it does not handle the case where we're in longer multi-turn + conversations and the client gave us reasoning content from previous turns of + the conversation with multiple assistant messages to the final channel in the + conversation. + + So, we find the index of the last assistant message to the final channel and drop + all analysis messages that precede it, leaving only the analysis messages that + are relevant to the current part of the conversation. + """ + last_assistant_final_index = -1 + for i in range(len(msgs) - 1, -1, -1): + msg = msgs[i] + if msg.author.role == "assistant" and msg.channel == "final": + last_assistant_final_index = i + break + + cleaned_msgs: list[Message] = [] + for i, msg in enumerate(msgs): + if i < last_assistant_final_index and msg.channel == "analysis": + continue + cleaned_msgs.append(msg) + + return cleaned_msgs + + +def flatten_chat_text_content(content: str | list | None) -> str | None: + """ + Extract the text parts from a chat message content field and flatten them + into a single string. + """ + if isinstance(content, list): + return "".join( + item.get("text", "") + for item in content + if isinstance(item, dict) and item.get("type") == "text" + ) + return content + + +def parse_chat_input_to_harmony_message( + chat_msg, tool_id_names: dict[str, str] | None = None +) -> list[Message]: + """ + Parse a message from request.messages in the Chat Completion API to + Harmony messages. + """ + tool_id_names = tool_id_names or {} + + if not isinstance(chat_msg, dict): + # Handle Pydantic models + chat_msg = chat_msg.model_dump(exclude_none=True) + + role = chat_msg.get("role") + msgs: list[Message] = [] + + # Assistant message with tool calls + tool_calls = chat_msg.get("tool_calls", []) + + if role == "assistant" and tool_calls: + content = flatten_chat_text_content(chat_msg.get("content")) + if content: + commentary_msg = Message.from_role_and_content(Role.ASSISTANT, content) + commentary_msg = commentary_msg.with_channel("commentary") + msgs.append(commentary_msg) + + reasoning_content = chat_msg.get("reasoning") or chat_msg.get( + "reasoning_content" + ) + if reasoning_content: + analysis_msg = Message.from_role_and_content( + Role.ASSISTANT, reasoning_content + ) + analysis_msg = analysis_msg.with_channel("analysis") + msgs.append(analysis_msg) + + for call in tool_calls: + func = call.get("function", {}) + name = func.get("name", "") + arguments = func.get("arguments", "") or "" + msg = Message.from_role_and_content(Role.ASSISTANT, arguments) + msg = msg.with_channel("commentary") + msg = msg.with_recipient(f"functions.{name}") + # Officially, this should be `<|constrain|>json` but there is not clear + # evidence that improves accuracy over `json` and some anecdotes to the + # contrary. Further testing of the different content_types is needed. + msg = msg.with_content_type("json") + msgs.append(msg) + return msgs + + # Tool role message (tool output) + if role == "tool": + tool_call_id = chat_msg.get("tool_call_id", "") + name = tool_id_names.get(tool_call_id, "") + content = chat_msg.get("content", "") or "" + content = flatten_chat_text_content(content) + + msg = ( + Message.from_author_and_content( + Author.new(Role.TOOL, f"functions.{name}"), content + ) + .with_channel("commentary") + .with_recipient("assistant") + ) + return [msg] + + # Non-tool reasoning content + reasoning_content = chat_msg.get("reasoning") or chat_msg.get("reasoning_content") + if role == "assistant" and reasoning_content: + analysis_msg = Message.from_role_and_content(Role.ASSISTANT, reasoning_content) + analysis_msg = analysis_msg.with_channel("analysis") + msgs.append(analysis_msg) + + # Default: user/assistant/system messages with content + content = chat_msg.get("content") or "" + if content is None: + content = "" + if isinstance(content, str): + contents = [TextContent(text=content)] + else: + # TODO: Support refusal. + contents = [TextContent(text=c.get("text", "")) for c in content] + + # Only add assistant messages if they have content, as reasoning or tool calling + # assistant messages were already added above. + if role == "assistant" and contents and contents[0].text: + msg = Message.from_role_and_contents(role, contents) + # Send non-tool assistant messages to the final channel + msg = msg.with_channel("final") + msgs.append(msg) + # For user/system/developer messages, add them directly even if no content. + elif role != "assistant": + msg = Message.from_role_and_contents(role, contents) + msgs.append(msg) + + return msgs + + def parse_input_to_harmony_message(chat_msg) -> list[Message]: + """ + Parse a message from request.previous_input_messages in the Responsees API to + Harmony messages. + """ if not isinstance(chat_msg, dict): # Handle Pydantic models chat_msg = chat_msg.model_dump(exclude_none=True) @@ -258,14 +428,7 @@ def parse_input_to_harmony_message(chat_msg) -> list[Message]: if role == "tool": name = chat_msg.get("name", "") content = chat_msg.get("content", "") or "" - if isinstance(content, list): - # Handle array format for tool message content - # by concatenating all text parts. - content = "".join( - item.get("text", "") - for item in content - if isinstance(item, dict) and item.get("type") == "text" - ) + content = flatten_chat_text_content(content) msg = Message.from_author_and_content( Author.new(Role.TOOL, f"functions.{name}"), content @@ -623,20 +786,40 @@ def parse_output_into_messages(token_ids: Iterable[int]) -> StreamableParser: def parse_chat_output( token_ids: Sequence[int], ) -> tuple[str | None, str | None, bool]: + """ + Parse the output of a Harmony chat completion into reasoning and final content. + Note that when the `openai` tool parser is used, serving_chat only uses this + for the reasoning content and gets the final content from the tool call parser. + + When the `openai` tool parser is not enabled, or when `GptOssReasoningParser` is + in use,this needs to return the final content without any tool calls parsed. + + Empty reasoning or final content is returned as None instead of an empty string. + """ parser = parse_output_into_messages(token_ids) output_msgs = parser.messages is_tool_call = False # TODO: update this when tool call is supported - if len(output_msgs) == 0: - # The generation has stopped during reasoning. - reasoning = parser.current_content - final_content = None - elif len(output_msgs) == 1: - # The generation has stopped during final message. - reasoning = output_msgs[0].content[0].text - final_content = parser.current_content - else: - reasoning_msg = output_msgs[:-1] - final_msg = output_msgs[-1] - reasoning = "\n".join([msg.content[0].text for msg in reasoning_msg]) - final_content = final_msg.content[0].text + + # Get completed messages from the parser + reasoning_texts = [ + msg.content[0].text for msg in output_msgs if msg.channel == "analysis" + ] + final_texts = [ + msg.content[0].text for msg in output_msgs if msg.channel != "analysis" + ] + + # Extract partial messages from the parser + if parser.current_channel == "analysis" and parser.current_content: + reasoning_texts.append(parser.current_content) + elif parser.current_channel != "analysis" and parser.current_content: + final_texts.append(parser.current_content) + + # Flatten multiple messages into a single string + reasoning: str | None = "\n".join(reasoning_texts) + final_content: str | None = "\n".join(final_texts) + + # Return None instead of empty string since existing callers check for None + reasoning = reasoning or None + final_content = final_content or None + return reasoning, final_content, is_tool_call diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 2560a5b2cdf41..d94fa7dd91937 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -27,8 +27,8 @@ from vllm.entrypoints.openai.parser.harmony_utils import ( get_stop_tokens_for_assistant_actions, get_streamable_parser_for_assistant, get_system_message, + parse_chat_inputs_to_harmony_messages, parse_chat_output, - parse_input_to_harmony_message, render_for_completion, ) from vllm.entrypoints.openai.protocol import ( @@ -822,6 +822,9 @@ class OpenAIServingChat(OpenAIServing): if delta_message is not None: harmony_tools_streamed[i] = True + elif cur_channel == "commentary": + # Tool call preambles meant to be shown to the user + delta_message = DeltaMessage(content=delta_text) else: delta_message = None # handle streaming deltas for tools with named tool_choice @@ -1770,6 +1773,11 @@ class OpenAIServingChat(OpenAIServing): ): messages: list[OpenAIMessage] = [] + # because of issues with pydantic we need to potentially + # re-serialize the tool_calls field of the request + # for more info: see comment in `maybe_serialize_tool_calls` + maybe_serialize_tool_calls(request) + # Add system message. # NOTE: In Chat Completion API, browsing is enabled by default # if the model supports it. TODO: Support browsing. @@ -1788,8 +1796,7 @@ class OpenAIServingChat(OpenAIServing): messages.append(dev_msg) # Add user message. - for chat_msg in request.messages: - messages.extend(parse_input_to_harmony_message(chat_msg)) + messages.extend(parse_chat_inputs_to_harmony_messages(request.messages)) # Render prompt token ids. prompt_token_ids = render_for_completion(messages) diff --git a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py index 387e87f208e66..a3cf793ed3a6d 100644 --- a/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/openai_tool_parser.py @@ -43,6 +43,7 @@ class OpenAIToolParser(ToolParser): parser = parse_output_into_messages(token_ids) tool_calls = [] final_content = None + commentary_content = None if len(parser.messages) > 0: for msg in parser.messages: @@ -75,11 +76,15 @@ class OpenAIToolParser(ToolParser): ) elif msg.channel == "final": final_content = msg_text + elif msg.channel == "commentary" and not msg.recipient: + commentary_content = msg_text return ExtractedToolCallInformation( tools_called=len(tool_calls) > 0, tool_calls=tool_calls, - content=final_content, + # prefer final content over commentary content if both are present + # commentary content is tool call preambles meant to be shown to the user + content=final_content or commentary_content, ) def extract_tool_calls_streaming( From 302b2c1eb968711abe3e765f7a936dea66535907 Mon Sep 17 00:00:00 2001 From: rasmith Date: Fri, 12 Dec 2025 03:30:23 -0600 Subject: [PATCH 25/57] [CI/Build][AMD] Fix ref_dynamic_per_token_quant reference implementation on ROCm. (#30291) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- tests/kernels/quant_utils.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index e29f66dca313f..7927bd0d200d8 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -30,16 +30,11 @@ def ref_dynamic_per_token_quant( if quant_dtype == torch.int8 else torch.finfo(quant_dtype) ) - qtype_traits_max = ( - ROCM_FP8FNUZ_MAX - if current_platform.is_rocm() and current_platform.is_fp8_fnuz() - else qtype_traits.max - ) - qtype_traits_min = ( - -ROCM_FP8FNUZ_MAX - if current_platform.is_rocm() and current_platform.is_fp8_fnuz() - else qtype_traits.min + use_fp8fnuz = ( + current_platform.is_fp8_fnuz() and quant_dtype == current_platform.fp8_dtype() ) + qtype_traits_max = ROCM_FP8FNUZ_MAX if use_fp8fnuz else qtype_traits.max + qtype_traits_min = -ROCM_FP8FNUZ_MAX if use_fp8fnuz else qtype_traits.min qtype_max = as_float32_tensor(qtype_traits_max) s_1 = as_float32_tensor(1.0) s_512 = as_float32_tensor(512.0) From f90319d5d14266769b65f0de28ff60b002a65fcc Mon Sep 17 00:00:00 2001 From: Jaehwang Jung Date: Fri, 12 Dec 2025 19:27:20 +0900 Subject: [PATCH 26/57] [Bugfix] Schedule failure due to wrong get_image_size_with_most_features (#29692) --- .../multimodal/processing/test_gemma3.py | 42 ++++++++++++++++++ .../multimodal/processing/test_qwen2_vl.py | 35 +++++++++++++++ vllm/model_executor/models/gemma3_mm.py | 5 ++- vllm/model_executor/models/qwen2_vl.py | 44 ++++++++++++++++--- 4 files changed, 117 insertions(+), 9 deletions(-) create mode 100644 tests/models/multimodal/processing/test_gemma3.py diff --git a/tests/models/multimodal/processing/test_gemma3.py b/tests/models/multimodal/processing/test_gemma3.py new file mode 100644 index 0000000000000..32a459ee8cdfb --- /dev/null +++ b/tests/models/multimodal/processing/test_gemma3.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest + +from vllm.multimodal import MULTIMODAL_REGISTRY + +from ....conftest import ImageTestAssets +from ...utils import build_model_context + + +@pytest.mark.parametrize("model_id", ["google/gemma-3-4b-it"]) +def test_get_image_size_with_most_features( + image_assets: ImageTestAssets, model_id: str +): + ctx = build_model_context( + model_id, + mm_processor_kwargs={"do_pan_and_scan": True}, + limit_mm_per_prompt={"image": 1}, + ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + + hf_processor_mm_kwargs: dict[str, object] = {} + hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs) + + max_image_size = processor.info.get_image_size_with_most_features() + max_tokens = processor.info.get_num_image_tokens( + image_width=max_image_size.width, + image_height=max_image_size.height, + processor=hf_processor, + ) + + prompt = "" + image_seq_length = hf_processor.image_seq_length + + for asset in image_assets: + mm_data = {"image": [asset.pil_image]} + processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs) + mm_kwargs_data = processed_inputs["mm_kwargs"].get_data() + num_patches_tensor = mm_kwargs_data["num_patches"] + tokens = int(num_patches_tensor.item()) * image_seq_length + assert tokens <= max_tokens diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py index 9f4cdb6789b2c..20beaa6011b8f 100644 --- a/tests/models/multimodal/processing/test_qwen2_vl.py +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -53,3 +53,38 @@ def test_processor_override( assert img_tok_count == expected_toks_per_img * num_imgs assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs assert pixel_shape[1] == expected_pixels_shape[1] + + +@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) +@pytest.mark.parametrize("max_pixels", [1280 * 28 * 28, 1283 * 28 * 28]) +def test_get_image_size_with_most_features( + image_assets: ImageTestAssets, + model_id: str, + max_pixels: int, +): + ctx = build_model_context( + model_id, + mm_processor_kwargs={"max_pixels": max_pixels}, + limit_mm_per_prompt={"image": 1}, + ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + + hf_processor_mm_kwargs: dict[str, object] = {} + hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs) + merge_size = processor.info.get_hf_config().vision_config.spatial_merge_size + + max_image_size = processor.info.get_image_size_with_most_features() + max_tokens = processor.info.get_num_image_tokens( + image_width=max_image_size.width, + image_height=max_image_size.height, + image_processor=hf_processor.image_processor, + ) + + prompt = "<|vision_start|><|image_pad|><|vision_end|>" + for asset in image_assets: + mm_data = {"image": [asset.pil_image]} + processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs) + grid_thw = processed_inputs["mm_kwargs"].get_data()["image_grid_thw"].tolist() + t, h, w = grid_thw[0] + tokens = (t * h * w) // (merge_size**2) + assert tokens < max_tokens diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index e8dec36a1c5b8..45dfacd94431c 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -237,8 +237,9 @@ class Gemma3ProcessingInfo(BaseProcessingInfo): ) max_num_crops = images_kwargs["pan_and_scan_max_num_crops"] - # Result in the max possible feature size (h:w = max_num_crops:1) - return ImageSize(height=50 * max_num_crops, width=50) + vision_config = self.get_hf_config().vision_config + native_size = vision_config.image_size + return ImageSize(height=native_size * max_num_crops, width=native_size) class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]): diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 2c4ac2f8efff1..4e54208a59b67 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -25,6 +25,7 @@ # limitations under the License. """Inference-only Qwen2-VL model compatible with HuggingFace weights.""" +import math from collections.abc import Callable, Iterable, Mapping, Sequence from functools import partial from typing import Annotated, Any, Literal, TypeAlias @@ -959,13 +960,42 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): return num_video_tokens def get_image_size_with_most_features(self) -> ImageSize: - max_image_size, _ = self._get_vision_info( - image_width=9999999, - image_height=9999999, - num_frames=1, - image_processor=None, - ) - return max_image_size + # NOTE: Simply processing a huge size with _get_vision_info might not give a + # size that maximizes the number of featrues, i.e., the number of (merged) + # patches. This is because the number of patches limits the allowed aspect + # ratios. For example, suppose the maximum number of patches is 1280. A square + # image cannot be broken down into 1280 patches, so feeding a giant square image + # into _get_vision_info will not yield a size that maximizes the number of + # patches. Therefore, we directly factorize the maximum number of patches into + # height and width. The tricky part is to avoid extreme aspect ratios (>200 for + # qwen2-vl). If we can't find a suitable aspect ratio, we decrease the number of + # patches and retry. This is safe because the processor does not accept extreme + # aspect ratios, so there is no valid post-resize image with the number of + # patches that yields extreme aspect ratios. + + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + patch_size = vision_config.patch_size + merge_size = vision_config.spatial_merge_size + image_processor = self.get_image_processor() + max_pixels = image_processor.max_pixels or image_processor.size["longest_edge"] + unit = patch_size * merge_size + max_seq_len = max_pixels // (unit * unit) + + def closest_factor_pair(n: int) -> tuple[int, int]: + # left <= right + for d in range(math.isqrt(n), 0, -1): + if n % d == 0: + return d, n // d + return 1, n + + height_factor, width_factor = 1, max_seq_len + for seq_len in range(max_seq_len, 0, -1): + height_factor, width_factor = closest_factor_pair(seq_len) + if width_factor / height_factor <= 200: + break + + return ImageSize(width=unit * width_factor, height=unit * height_factor) def get_max_image_tokens(self) -> int: target_width, target_height = self.get_image_size_with_most_features() From 91401c7a266450e332e88c3b569e93aeecca9a89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=B4=E5=9D=8E?= Date: Fri, 12 Dec 2025 18:54:52 +0800 Subject: [PATCH 27/57] [Bugfix] Fix CMakeLists Environment Variable (#21804) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: wu-kan Signed-off-by: 吴坎 Signed-off-by: wu-kan --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b93e3fe91603..cd52df86e0346 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -384,7 +384,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}) execute_process( COMMAND ${CMAKE_COMMAND} -E env - PYTHONPATH=$PYTHONPATH + PYTHONPATH=$ENV{PYTHONPATH} ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR} RESULT_VARIABLE marlin_generation_result OUTPUT_VARIABLE marlin_generation_result @@ -822,7 +822,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH}) execute_process( COMMAND ${CMAKE_COMMAND} -E env - PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH + PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$ENV{PYTHONPATH} ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT} RESULT_VARIABLE machete_generation_result OUTPUT_VARIABLE machete_generation_output @@ -1004,7 +1004,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}) execute_process( COMMAND ${CMAKE_COMMAND} -E env - PYTHONPATH=$PYTHONPATH + PYTHONPATH=$ENV{PYTHONPATH} ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR} RESULT_VARIABLE moe_marlin_generation_result OUTPUT_VARIABLE moe_marlin_generation_output From 3e41992fecdc31ee60715bb350f18fec18ed6680 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 12 Dec 2025 08:57:47 -0500 Subject: [PATCH 28/57] [Attention] Use sparse prefill kernel for fp8 kv-cache in DeepSeek-v3.2 (#27532) Signed-off-by: Lucas Wilkinson --- csrc/cache.h | 12 +- csrc/cache_kernels.cu | 131 +++- csrc/torch_bindings.cpp | 7 + tests/conftest.py | 21 + tests/kernels/moe/test_batched_deepgemm.py | 2 +- tests/kernels/moe/test_batched_moe.py | 1 + tests/kernels/moe/test_block_fp8.py | 2 +- tests/kernels/moe/test_cutlass_moe.py | 27 +- tests/kernels/moe/test_deepep_deepgemm_moe.py | 6 + tests/kernels/moe/test_deepep_moe.py | 6 + tests/kernels/moe/test_deepgemm.py | 2 +- tests/kernels/moe/test_flashinfer.py | 1 + tests/kernels/moe/test_flashinfer_moe.py | 9 +- .../moe/test_gpt_oss_triton_kernels.py | 2 +- .../moe/test_modular_kernel_combinations.py | 6 + .../moe/test_modular_oai_triton_moe.py | 1 + tests/kernels/moe/test_moe.py | 1 + tests/kernels/moe/test_nvfp4_moe.py | 2 +- tests/kernels/moe/test_pplx_moe.py | 5 + .../v1/attention/test_sparse_mla_backends.py | 251 ++++++- vllm/_custom_ops.py | 23 + vllm/envs.py | 4 + .../layers/fused_moe/modular_kernel.py | 73 +- vllm/model_executor/models/deepseek_v2.py | 37 +- .../attention/backends/mla/flashmla_sparse.py | 665 +++++++++++++++--- vllm/v1/attention/backends/mla/indexer.py | 48 +- vllm/v1/attention/backends/utils.py | 27 + vllm/v1/worker/gpu_model_runner.py | 6 + vllm/v1/worker/gpu_worker.py | 5 + vllm/v1/worker/workspace.py | 245 +++++++ 30 files changed, 1372 insertions(+), 256 deletions(-) create mode 100644 vllm/v1/worker/workspace.py diff --git a/csrc/cache.h b/csrc/cache.h index f2a5ec0acf5cd..cbe44c09eb624 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -58,6 +59,15 @@ void cp_gather_cache( torch::Tensor const& cu_seq_lens, // [BATCH+1] int64_t batch_size, std::optional seq_starts = std::nullopt); +// Gather and upconvert FP8 KV cache to BF16 workspace +void cp_gather_and_upconvert_fp8_kv_cache( + torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656] + torch::Tensor const& dst, // [TOT_TOKENS, 576] + torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] + torch::Tensor const& seq_lens, // [BATCH] + torch::Tensor const& workspace_starts, // [BATCH] + int64_t batch_size); + // Indexer K quantization and cache function void indexer_k_quant_and_cache( torch::Tensor& k, // [num_tokens, head_dim] @@ -72,4 +82,4 @@ void cp_gather_indexer_k_quant_cache( torch::Tensor& dst_k, // [num_tokens, head_dim] torch::Tensor& dst_scale, // [num_tokens, head_dim / quant_block_size * 4] const torch::Tensor& block_table, // [batch_size, num_blocks] - const torch::Tensor& cu_seq_lens); // [batch_size + 1] \ No newline at end of file + const torch::Tensor& cu_seq_lens); // [batch_size + 1] diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 8a5457206c706..f11c5f24c12ec 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -2,6 +2,7 @@ #include #include #include +#include #include "cuda_utils.h" #include "cuda_compat.h" @@ -514,7 +515,8 @@ __global__ void indexer_k_quant_and_cache_kernel( const int quant_block_size, // quantization block size const int cache_block_size, // cache block size const int cache_stride, // stride for each token in kv_cache - const bool use_ue8m0 // use ue8m0 scale format + + const bool use_ue8m0 // use ue8m0 scale format ) { constexpr int VEC_SIZE = 4; const int64_t token_idx = blockIdx.x; @@ -1061,6 +1063,82 @@ void gather_and_maybe_dequant_cache( } namespace vllm { + +// Gather and upconvert FP8 KV cache tokens to BF16 workspace +// Similar to cp_gather_cache but specifically for FP8->BF16 conversion +__global__ void cp_gather_and_upconvert_fp8_kv_cache( + const uint8_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656] + __nv_bfloat16* __restrict__ dst, // [TOT_TOKENS, 576] + const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES] + const int32_t* __restrict__ seq_lens, // [BATCH] + const int32_t* __restrict__ workspace_starts, // [BATCH] + const int32_t block_size, const int32_t head_dim, + const int64_t block_table_stride, const int64_t cache_block_stride, + const int64_t cache_entry_stride, const int64_t dst_entry_stride) { + const int64_t bid = blockIdx.x; // Batch ID + const int32_t num_splits = gridDim.y; + const int32_t split = blockIdx.y; + const int32_t seq_start = workspace_starts[bid]; + const int32_t seq_len = seq_lens[bid]; + const int32_t tot_slots = seq_len; + const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits); + + const int32_t split_start = split * split_slots; + const int32_t split_end = min((split + 1) * split_slots, tot_slots); + + const bool is_active_split = (split_start < tot_slots); + + if (!is_active_split) return; + + // Adjust the pointer for the block_table for this batch + const int32_t batch_offset = bid * block_table_stride; + int32_t offset = split_start; + int32_t offset_div = offset / block_size; + offset = offset % block_size; + const int32_t* batch_block_table = block_table + batch_offset; + + // Adjust dst pointer based on the cumulative sequence lengths + dst += seq_start * dst_entry_stride; + + const int tid = threadIdx.x; + + // Process each token in this split + for (int pid = split_start; pid < split_end; ++pid) { + auto block_id = batch_block_table[offset_div]; + const uint8_t* token_ptr = + src_cache + block_id * cache_block_stride + offset * cache_entry_stride; + __nv_bfloat16* dst_ptr = dst + pid * dst_entry_stride; + + // FP8 format: 512 bytes fp8 + 16 bytes scales + 128 bytes rope (64 bf16) + const uint8_t* no_pe_ptr = token_ptr; + const float* scales_ptr = reinterpret_cast(token_ptr + 512); + const __nv_bfloat16* rope_ptr = + reinterpret_cast(token_ptr + 512 + 16); + + // Parallelize fp8 dequant (512 elements) and rope copy (64 elements) + if (tid < 512) { + // FP8 dequantization + const int tile = tid >> 7; // each tile is 128 elements + const float scale = scales_ptr[tile]; + const uint8_t val = no_pe_ptr[tid]; + dst_ptr[tid] = + fp8::scaled_convert<__nv_bfloat16, uint8_t, + vllm::Fp8KVCacheDataType::kFp8E4M3>(val, scale); + } else if (tid < 576) { + // Rope copy (64 bf16 elements) + const int rope_idx = tid - 512; + dst_ptr[512 + rope_idx] = rope_ptr[rope_idx]; + } + + // Move to next token + offset += 1; + if (offset == block_size) { + offset_div += 1; + offset = 0; + } + } +} + template // Note(hc): The cp_gather_cache allows seq_starts to no longer be divisible by // block_size. @@ -1202,6 +1280,57 @@ void cp_gather_cache( } } +void cp_gather_and_upconvert_fp8_kv_cache( + torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, 656] + torch::Tensor const& dst, // [TOT_TOKENS, 576] + torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] + torch::Tensor const& seq_lens, // [BATCH] + torch::Tensor const& workspace_starts, // [BATCH] + int64_t batch_size) { + at::cuda::OptionalCUDAGuard device_guard(src_cache.device()); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + int32_t block_size = src_cache.size(1); + int32_t head_dim = dst.size(1); + + TORCH_CHECK(block_table.dtype() == torch::kInt32, + "block_table must be int32"); + TORCH_CHECK(seq_lens.dtype() == torch::kInt32, "seq_lens must be int32"); + TORCH_CHECK(workspace_starts.dtype() == torch::kInt32, + "workspace_starts must be int32"); + + TORCH_CHECK(src_cache.device() == dst.device(), + "src_cache and dst must be on the same device"); + TORCH_CHECK(src_cache.device() == block_table.device(), + "src_cache and block_table must be on the same device"); + TORCH_CHECK(src_cache.device() == seq_lens.device(), + "src_cache and seq_lens must be on the same device"); + TORCH_CHECK(src_cache.device() == workspace_starts.device(), + "src_cache and workspace_starts must be on the same device"); + + TORCH_CHECK(src_cache.dtype() == torch::kUInt8, "src_cache must be uint8"); + TORCH_CHECK(dst.dtype() == torch::kBFloat16, "dst must be bfloat16"); + TORCH_CHECK(head_dim == 576, "head_dim must be 576 for MLA"); + + int64_t block_table_stride = block_table.stride(0); + int64_t cache_block_stride = src_cache.stride(0); + int64_t cache_entry_stride = src_cache.stride(1); + int64_t dst_entry_stride = dst.stride(0); + + // Decide on the number of splits based on the batch size + int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16; + dim3 grid(batch_size, num_splits); + dim3 block(576); + + vllm::cp_gather_and_upconvert_fp8_kv_cache<<>>( + src_cache.data_ptr(), + reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()), + block_table.data_ptr(), seq_lens.data_ptr(), + workspace_starts.data_ptr(), block_size, head_dim, + block_table_stride, cache_block_stride, cache_entry_stride, + dst_entry_stride); +} + // Macro to dispatch the kernel based on the data type. #define CALL_INDEXER_K_QUANT_AND_CACHE(KV_T, CACHE_T, KV_DTYPE) \ vllm::indexer_k_quant_and_cache_kernel \ diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index d4c6f8c67c516..83d4943d62776 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -754,6 +754,13 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { "Tensor cu_seq_lens, int batch_size, Tensor? seq_starts) -> ()"); cache_ops.impl("cp_gather_cache", torch::kCUDA, &cp_gather_cache); + cache_ops.def( + "cp_gather_and_upconvert_fp8_kv_cache(Tensor src_cache, Tensor! dst, " + "Tensor block_table, Tensor seq_lens, Tensor workspace_starts, int " + "batch_size) -> ()"); + cache_ops.impl("cp_gather_and_upconvert_fp8_kv_cache", torch::kCUDA, + &cp_gather_and_upconvert_fp8_kv_cache); + cache_ops.def( "indexer_k_quant_and_cache(Tensor k, Tensor! kv_cache, Tensor " "slot_mapping, " diff --git a/tests/conftest.py b/tests/conftest.py index 5b26a02823c56..b21cfd5ba85c4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -202,6 +202,27 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): cleanup_dist_env_and_memory() +@pytest.fixture +def workspace_init(): + """Initialize the workspace manager for tests that need it. + + This fixture initializes the workspace manager with a CUDA device + if available, and resets it after the test completes. Tests that + create a full vLLM engine should NOT use this fixture as the engine + will initialize the workspace manager itself. + """ + from vllm.v1.worker.workspace import ( + init_workspace_manager, + reset_workspace_manager, + ) + + if torch.cuda.is_available(): + device = torch.device("cuda:0") + init_workspace_manager(device) + yield + reset_workspace_manager() + + @pytest.fixture(autouse=True) def dynamo_reset(): yield diff --git a/tests/kernels/moe/test_batched_deepgemm.py b/tests/kernels/moe/test_batched_deepgemm.py index 59cecd60d3d61..0ba3d8d4c958e 100644 --- a/tests/kernels/moe/test_batched_deepgemm.py +++ b/tests/kernels/moe/test_batched_deepgemm.py @@ -27,7 +27,7 @@ BLOCK_SIZE = [128, 128] @pytest.mark.parametrize("N", [512, 1024]) # intermediate dim per expert @pytest.mark.parametrize("topk", [2, 4]) def test_batched_deepgemm_vs_triton( - E: int, T: int, K: int, N: int, topk: int, monkeypatch + E: int, T: int, K: int, N: int, topk: int, monkeypatch, workspace_init ): """Compare BatchedDeepGemmExperts to BatchedTritonExperts.""" diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py index dab1207d78031..2ef170f1ab308 100644 --- a/tests/kernels/moe/test_batched_moe.py +++ b/tests/kernels/moe/test_batched_moe.py @@ -248,6 +248,7 @@ def test_fused_moe_batched_experts( per_act_token_quant: bool, block_shape: list[int] | None, input_scales: bool, + workspace_init, ): """Note: float8_e4m3fn is not supported on CUDA architecture < 89, and those tests will be skipped on unsupported hardware.""" diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py index b0ff1e64e3219..53a03f48e24ee 100644 --- a/tests/kernels/moe/test_block_fp8.py +++ b/tests/kernels/moe/test_block_fp8.py @@ -137,7 +137,7 @@ def setup_cuda(): @pytest.mark.parametrize("seed", SEEDS) @torch.inference_mode() def test_w8a8_block_fp8_fused_moe( - M, N, K, E, topk, block_size, dtype, seed, monkeypatch + M, N, K, E, topk, block_size, dtype, seed, monkeypatch, workspace_init ): if topk > E: pytest.skip(f"Skipping test; topk={topk} > E={E}") diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index c15837f145705..0160694d7bb54 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -274,6 +274,7 @@ def test_cutlass_moe_8_bit_no_graph( per_act_token: bool, per_out_ch: bool, monkeypatch, + workspace_init, ep_size: int | None = None, ): current_platform.seed_everything(7) @@ -329,6 +330,7 @@ def test_cutlass_moe_8_bit_cuda_graph( per_act_token: bool, per_out_ch: bool, monkeypatch, + workspace_init, ): current_platform.seed_everything(7) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") @@ -385,9 +387,19 @@ def test_cutlass_moe_8_bit_EP( per_out_channel: bool, ep_size: int, monkeypatch, + workspace_init, ): test_cutlass_moe_8_bit_no_graph( - m, n, k, e, topk, per_act_token, per_out_channel, monkeypatch, ep_size + m, + n, + k, + e, + topk, + per_act_token, + per_out_channel, + monkeypatch, + workspace_init, + ep_size, ) @@ -419,9 +431,19 @@ def test_cutlass_moe_8_bit_EP_large( per_out_channel: bool, ep_size: int, monkeypatch, + workspace_init, ): test_cutlass_moe_8_bit_no_graph( - m, n, k, e, topk, per_act_token, per_out_channel, monkeypatch, ep_size + m, + n, + k, + e, + topk, + per_act_token, + per_out_channel, + monkeypatch, + workspace_init, + ep_size, ) @@ -445,6 +467,7 @@ def test_run_cutlass_moe_fp8( per_act_token: bool, per_out_channel: bool, ep_size: int, + workspace_init, ): current_platform.seed_everything(7) with set_current_vllm_config(vllm_config): diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 455ecacef5ec3..f427734ef09e2 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -29,6 +29,7 @@ from vllm.utils.deep_gemm import ( is_deep_gemm_supported, ) from vllm.utils.import_utils import has_deep_ep, has_deep_gemm +from vllm.v1.worker.workspace import init_workspace_manager from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch @@ -363,6 +364,9 @@ def _test_deepep_deepgemm_moe( w1_scale: torch.Tensor, w2_scale: torch.Tensor, ): + device = torch.device(f"cuda:{pgi.local_rank}") + init_workspace_manager(device) + current_platform.seed_everything(pgi.rank) w1 = w1.to(device=torch.cuda.current_device()) @@ -445,6 +449,7 @@ def test_ht_deepep_deepgemm_moe( topk: int, world_dp_size: tuple[int, int], disable_deepgemm_ue8m0, + workspace_init, ): """ Tests for High-Throughput DeepEP + DeepGemm integration. @@ -518,6 +523,7 @@ def test_ll_deepep_deepgemm_moe( block_size: list[int], world_dp_size: tuple[int, int], disable_deepgemm_ue8m0, + workspace_init, ): """ Tests for Low-Latency DeepEP + DeepGemm integration. diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index d78b8250463a9..e698ca92a1515 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -22,6 +22,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( ) from vllm.platforms import current_platform from vllm.utils.import_utils import has_deep_ep +from vllm.v1.worker.workspace import init_workspace_manager from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch @@ -342,6 +343,9 @@ def _deep_ep_moe( use_fp8_dispatch: bool, per_act_token_quant: bool, ): + device = torch.device(f"cuda:{pgi.local_rank}") + init_workspace_manager(device) + if not low_latency_mode: assert not use_fp8_dispatch, ( "FP8 dispatch interface is available only in low-latency mode" @@ -437,6 +441,7 @@ def test_deep_ep_moe( topk: int, world_dp_size: tuple[int, int], per_act_token_quant: bool, + workspace_init, ): low_latency_mode = False use_fp8_dispatch = False @@ -492,6 +497,7 @@ def test_low_latency_deep_ep_moe( topk: int, world_dp_size: tuple[int, int], use_fp8_dispatch: bool, + workspace_init, ): low_latency_mode = True diff --git a/tests/kernels/moe/test_deepgemm.py b/tests/kernels/moe/test_deepgemm.py index 9b1054f7d0ab8..442b561f8f315 100644 --- a/tests/kernels/moe/test_deepgemm.py +++ b/tests/kernels/moe/test_deepgemm.py @@ -143,7 +143,7 @@ NUM_EXPERTS = [32] @pytest.mark.parametrize("topk", TOPKS) @pytest.mark.parametrize("num_experts", NUM_EXPERTS) @pytest.mark.skipif(not is_deep_gemm_supported(), reason="Requires deep_gemm kernels") -def test_deepgemm_vs_triton(m, n, k, topk, num_experts, monkeypatch): +def test_deepgemm_vs_triton(m, n, k, topk, num_experts, monkeypatch, workspace_init): with monkeypatch.context() as mp: mp.setenv("VLLM_USE_DEEP_GEMM", "1") diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py index a6977f222408d..d553e2820e5ff 100644 --- a/tests/kernels/moe/test_flashinfer.py +++ b/tests/kernels/moe/test_flashinfer.py @@ -206,6 +206,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph( topk: int, activation: str, monkeypatch, + workspace_init, ): current_platform.seed_everything(7) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") diff --git a/tests/kernels/moe/test_flashinfer_moe.py b/tests/kernels/moe/test_flashinfer_moe.py index b2be03ecee2f1..133a8a4a30a60 100644 --- a/tests/kernels/moe/test_flashinfer_moe.py +++ b/tests/kernels/moe/test_flashinfer_moe.py @@ -51,7 +51,14 @@ MNK_FACTORS = [ @pytest.mark.parametrize("activation", ["silu_and_mul", "relu2"]) @torch.inference_mode() def test_flashinfer_fp4_moe_no_graph( - m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype, activation: str + m: int, + n: int, + k: int, + e: int, + topk: int, + dtype: torch.dtype, + activation: str, + workspace_init, ): current_platform.seed_everything(7) with set_current_vllm_config( diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py index 98e80ec029777..384f43db479b5 100644 --- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py +++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py @@ -269,7 +269,7 @@ class Case: ) @pytest.mark.parametrize("num_token", [2]) @pytest.mark.parametrize("tp", [1, 2, 4, 8]) -def test_equiv(num_token, a_dtype, w_dtype, tp): +def test_equiv(num_token, a_dtype, w_dtype, tp, workspace_init): from triton_kernels.tensor_details import layout if not hasattr(layout, "make_default_matmul_mxfp4_w_layout"): diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index 2a30ef2355529..6ebf1016c166c 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -16,6 +16,7 @@ from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx from vllm.utils.torch_utils import cuda_device_count_stateless +from vllm.v1.worker.workspace import init_workspace_manager from .modular_kernel_tools.common import ( Config, @@ -77,6 +78,10 @@ def rank_worker( weights: WeightTensors, verbose: bool, ): + # Initialize workspace manager in child process + device = torch.device(f"cuda:{pgi.local_rank}") + init_workspace_manager(device) + current_platform.seed_everything(pgi.rank) # sanity check @@ -300,6 +305,7 @@ def test_modular_kernel_combinations_singlegpu( chunk_size: int | None, world_size: int, pytestconfig, + workspace_init, ): """Note: float8_e4m3fn is not supported on CUDA architecture < 89, and those tests will be skipped on unsupported hardware.""" diff --git a/tests/kernels/moe/test_modular_oai_triton_moe.py b/tests/kernels/moe/test_modular_oai_triton_moe.py index c8616f13bbf85..1abb08f878b2b 100644 --- a/tests/kernels/moe/test_modular_oai_triton_moe.py +++ b/tests/kernels/moe/test_modular_oai_triton_moe.py @@ -209,6 +209,7 @@ def test_oai_triton_moe( num_experts: int, topk: int, unfused: bool, + workspace_init, ): current_platform.seed_everything(0) ( diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 82659276af37c..ce99d9691fdc8 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -231,6 +231,7 @@ def test_fused_moe( padding: bool, chunk_size: int, monkeypatch, + workspace_init, ): current_platform.seed_everything(7) diff --git a/tests/kernels/moe/test_nvfp4_moe.py b/tests/kernels/moe/test_nvfp4_moe.py index aa544fe0e0f63..e67bd76a16181 100644 --- a/tests/kernels/moe/test_nvfp4_moe.py +++ b/tests/kernels/moe/test_nvfp4_moe.py @@ -40,7 +40,7 @@ MNK_FACTORS = [ @pytest.mark.parametrize("dtype", [torch.bfloat16]) @torch.inference_mode() def test_cutlass_fp4_moe_no_graph( - m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype + m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype, workspace_init ): current_platform.seed_everything(7) with set_current_vllm_config( diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index f671b23d300ce..35e554e16cb38 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -46,6 +46,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( ) from vllm.platforms import current_platform from vllm.utils.math_utils import round_up +from vllm.v1.worker.workspace import init_workspace_manager from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch @@ -181,6 +182,7 @@ def test_fused_moe_batched_experts( e: int, topk: int, dtype: torch.dtype, + workspace_init, ): current_platform.seed_everything(7) @@ -863,6 +865,9 @@ def _pplx_test_loop( make_weights: bool, test_fn: Callable, ): + device = torch.device(f"cuda:{pgi.local_rank}") + init_workspace_manager(device) + def format_result(msg, ex=None): if ex is not None: x = str(ex) diff --git a/tests/v1/attention/test_sparse_mla_backends.py b/tests/v1/attention/test_sparse_mla_backends.py index b34d587eb362d..8049347280c5a 100644 --- a/tests/v1/attention/test_sparse_mla_backends.py +++ b/tests/v1/attention/test_sparse_mla_backends.py @@ -22,10 +22,14 @@ from tests.v1.attention.utils import ( ) from vllm import _custom_ops as ops from vllm.attention.ops import flashmla +from vllm.config import set_current_vllm_config from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.utils.math_utils import cdiv -from vllm.v1.attention.backends.mla.flashmla_sparse import FlashMLASparseBackend -from vllm.v1.attention.backends.mla.indexer import split_prefill_chunks +from vllm.v1.attention.backends.mla.flashmla_sparse import ( + FlashMLASparseBackend, + triton_convert_req_index_to_global_index, +) +from vllm.v1.attention.backends.utils import split_prefill_chunks SPARSE_BACKEND_BATCH_SPECS = { name: BATCH_SPECS[name] @@ -114,8 +118,12 @@ def _quantize_dequantize_fp8_ds_mla( @pytest.mark.parametrize("batch_name", list(SPARSE_BACKEND_BATCH_SPECS.keys())) @pytest.mark.parametrize("kv_cache_dtype", ["fp8_ds_mla", "auto"]) @pytest.mark.parametrize("tensor_parallel_size", [1, 2, 4]) +@pytest.mark.skipif( + torch.cuda.get_device_capability() < (9, 0), + reason="FlashMLASparseBackend requires CUDA 9.0 or higher", +) def test_sparse_backend_decode_correctness( - dist_init, batch_name, kv_cache_dtype, tensor_parallel_size + dist_init, batch_name, kv_cache_dtype, tensor_parallel_size, workspace_init ): if not torch.cuda.is_available(): pytest.skip("CUDA is required for sparse MLA decode test") @@ -320,28 +328,29 @@ def test_sparse_backend_decode_correctness( mock_kv_b_proj.weight = torch.nn.Parameter(kv_b_proj_weight.T.contiguous()) impl_cls = FlashMLASparseBackend.get_impl_cls() - impl = impl_cls( - num_heads=num_heads, - head_size=head_size, - scale=scale, - num_kv_heads=1, - alibi_slopes=None, - sliding_window=None, - kv_cache_dtype=vllm_config.cache_config.cache_dtype, - logits_soft_cap=None, - attn_type="decoder", - kv_sharing_target_layer_name=None, - q_lora_rank=None, - kv_lora_rank=kv_lora_rank, - qk_nope_head_dim=qk_nope_head_dim, - qk_rope_head_dim=qk_rope_head_dim, - qk_head_dim=qk_nope_head_dim + qk_rope_head_dim, - v_head_dim=v_head_dim, - kv_b_proj=mock_kv_b_proj, - indexer=mock_indexer, - ) + with set_current_vllm_config(vllm_config): + impl = impl_cls( + num_heads=num_heads, + head_size=head_size, + scale=scale, + num_kv_heads=1, + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype=vllm_config.cache_config.cache_dtype, + logits_soft_cap=None, + attn_type="decoder", + kv_sharing_target_layer_name=None, + q_lora_rank=None, + kv_lora_rank=kv_lora_rank, + qk_nope_head_dim=qk_nope_head_dim, + qk_rope_head_dim=qk_rope_head_dim, + qk_head_dim=qk_nope_head_dim + qk_rope_head_dim, + v_head_dim=v_head_dim, + kv_b_proj=mock_kv_b_proj, + indexer=mock_indexer, + ) - impl.process_weights_after_loading(dtype) + impl.process_weights_after_loading(dtype) layer = MockAttentionLayer(device) out_buffer = torch.empty( @@ -366,22 +375,192 @@ def test_sparse_backend_decode_correctness( torch.testing.assert_close(backend_output, sdpa_reference, rtol=0.5, atol=0.5) +def _triton_convert_reference_impl( + req_ids: torch.Tensor, + block_table: torch.Tensor, + token_indices: torch.Tensor, + block_size: int, + num_topk_tokens: int, + HAS_PREFILL_WORKSPACE: bool = False, + prefill_workspace_request_ids: torch.Tensor | None = None, + prefill_workspace_starts: torch.Tensor | None = None, +) -> torch.Tensor: + """Reference implementation for triton_convert_req_index_to_global_index.""" + num_tokens = req_ids.shape[0] + max_blocks_per_req = block_table.shape[1] + result = torch.empty( + num_tokens, num_topk_tokens, dtype=torch.int32, device=req_ids.device + ) + + for token_id in range(num_tokens): + req_id = req_ids[token_id].item() + + # Determine if this token uses workspace or paged cache + use_prefill_workspace = False + workspace_start = 0 + if HAS_PREFILL_WORKSPACE and prefill_workspace_request_ids is not None: + assert prefill_workspace_starts is not None + prefill_req_id = prefill_workspace_request_ids[token_id].item() + if prefill_req_id >= 0: + use_prefill_workspace = True + workspace_start = prefill_workspace_starts[prefill_req_id].item() + + for idx_id in range(num_topk_tokens): + token_idx = token_indices[token_id, idx_id].item() + + if token_idx == -1: + result[token_id, idx_id] = -1 + elif use_prefill_workspace: + # Prefill + using prefill workspace: map to workspace offset + result[token_id, idx_id] = workspace_start + token_idx + else: + # Decode: map to paged cache + block_id = token_idx // block_size + if block_id >= max_blocks_per_req: + result[token_id, idx_id] = -1 + else: + block_num = block_table[req_id, block_id].item() + offset = token_idx % block_size + result[token_id, idx_id] = block_num * block_size + offset + + return result + + +@pytest.mark.parametrize("block_size", [16, 64, 128]) +@pytest.mark.parametrize("num_topk_tokens", [128, 256, 512]) +@pytest.mark.skipif( + torch.cuda.get_device_capability() < (9, 0), + reason="FlashMLASparseBackend requires CUDA 9.0 or higher", +) +def test_triton_convert_req_index_to_global_index_decode_only( + block_size, num_topk_tokens +): + device = torch.device("cuda") + num_tokens = 8 + num_requests = 4 + max_blocks_per_req = 10 + + req_id = torch.randint( + 0, num_requests, (num_tokens,), dtype=torch.int32, device=device + ) + block_table = torch.randint( + 0, 100, (num_requests, max_blocks_per_req), dtype=torch.int32, device=device + ) + + token_indices = torch.randint( + 0, + block_size * max_blocks_per_req, + (num_tokens, num_topk_tokens), + dtype=torch.int32, + device=device, + ) + + # Set some to -1 to test masking + token_indices[0, :10] = -1 + token_indices[3, 50:60] = -1 + + # Set some to out of bounds + token_indices[2, 100:110] = max_blocks_per_req * block_size + token_indices[6, 150:160] = max_blocks_per_req * block_size + + result = triton_convert_req_index_to_global_index( + req_id, + block_table, + token_indices, + BLOCK_SIZE=block_size, + NUM_TOPK_TOKENS=num_topk_tokens, + ) + + reference_result = _triton_convert_reference_impl( + req_id, + block_table, + token_indices, + block_size, + num_topk_tokens, + ) + + torch.testing.assert_close(result, reference_result, rtol=0, atol=0) + + +@pytest.mark.parametrize("block_size", [16]) +@pytest.mark.skipif( + torch.cuda.get_device_capability() < (9, 0), + reason="FlashMLASparseBackend requires CUDA 9.0 or higher", +) +def test_triton_convert_req_index_to_global_index_with_prefill_workspace(block_size): + device = torch.device("cuda") + num_requests = 4 + max_blocks_per_req = 8 + num_topk_tokens = 128 + + # First 6 tokens are decode (reqs 0, 1), last 6 are prefill (reqs 2, 3) + req_id = torch.tensor( + [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3], dtype=torch.int32, device=device + ) + prefill_workspace_request_ids = torch.tensor( + [-1, -1, -1, -1, -1, -1, 0, 0, 0, 1, 1, 1], dtype=torch.int32, device=device + ) + + # Workspace starts for the 2 prefill reqs: req 2 starts at 0, req 3 starts at 100 + prefill_workspace_starts = torch.tensor([0, 100], dtype=torch.int32, device=device) + + block_table = torch.randint( + 0, 50, (num_requests, max_blocks_per_req), dtype=torch.int32, device=device + ) + token_indices = torch.randint( + 0, + block_size * max_blocks_per_req, + (req_id.shape[0], num_topk_tokens), + dtype=torch.int32, + device=device, + ) + + # Set some to -1 to test masking + token_indices[0, :10] = -1 + token_indices[3, 50:60] = -1 + + # Set some to out of bounds + token_indices[2, 100:110] = max_blocks_per_req * block_size + token_indices[6, 150:160] = max_blocks_per_req * block_size + + result = triton_convert_req_index_to_global_index( + req_id, + block_table, + token_indices, + BLOCK_SIZE=block_size, + NUM_TOPK_TOKENS=num_topk_tokens, + HAS_PREFILL_WORKSPACE=True, + prefill_workspace_request_ids=prefill_workspace_request_ids, + prefill_workspace_starts=prefill_workspace_starts, + ) + + reference_result = _triton_convert_reference_impl( + req_id, + block_table, + token_indices, + block_size, + num_topk_tokens, + HAS_PREFILL_WORKSPACE=True, + prefill_workspace_request_ids=prefill_workspace_request_ids, + prefill_workspace_starts=prefill_workspace_starts, + ) + + torch.testing.assert_close(result, reference_result, rtol=0, atol=0) + + @pytest.mark.parametrize( - "seq_lens,max_buf,start,expected", + "seq_lens,max_buf,expected", [ # Basic split: totals per chunk ≤ max_buf - (torch.tensor([2, 3, 4, 2]), 5, 0, [(0, 2), (2, 3), (3, 4)]), - # Non-zero start index - (torch.tensor([2, 3, 4, 2]), 5, 1, [(1, 2), (2, 3), (3, 4)]), - # Exact fits should split between items when adding the next would - # overflow - (torch.tensor([5, 5, 5]), 5, 0, [(0, 1), (1, 2), (2, 3)]), + (torch.tensor([2, 3, 4, 2]), 5, [(0, 2), (2, 3), (3, 4)]), + # Exact fits should split between items when adding the next would overflow + (torch.tensor([5, 5, 5]), 5, [(0, 1), (1, 2), (2, 3)]), # All requests fit in a single chunk - (torch.tensor([1, 1, 1]), 10, 0, [(0, 3)]), - # Large buffer with non-zero start - (torch.tensor([4, 4, 4]), 100, 1, [(1, 3)]), + (torch.tensor([1, 1, 1]), 10, [(0, 3)]), + # Large buffer + (torch.tensor([4, 4, 4]), 100, [(0, 3)]), ], ) -def test_split_prefill_chunks(seq_lens, max_buf, start, expected): - out = split_prefill_chunks(seq_lens, max_buf, start) +def test_split_prefill_chunks(seq_lens, max_buf, expected): + out = split_prefill_chunks(seq_lens, max_buf) assert out == expected diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 6d862c5812560..52a58a082683d 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -2403,6 +2403,29 @@ def cp_gather_cache( ) +def cp_gather_and_upconvert_fp8_kv_cache( + src_cache: torch.Tensor, + dst: torch.Tensor, + block_table: torch.Tensor, + seq_lens: torch.Tensor, + workspace_starts: torch.Tensor, + batch_size: int, +) -> None: + """Gather and upconvert FP8 KV cache to BF16 workspace. + + Args: + src_cache: FP8 KV cache [num_blocks, block_size, 656] + dst: BF16 output workspace [total_tokens, 576] + block_table: Block indices [num_reqs, max_blocks] + seq_lens: Sequence lengths [num_reqs] + workspace_starts: Workspace start offsets [num_reqs] + batch_size: Number of requests + """ + torch.ops._C_cache_ops.cp_gather_and_upconvert_fp8_kv_cache( + src_cache, dst, block_table, seq_lens, workspace_starts, batch_size + ) + + def indexer_k_quant_and_cache( k: torch.Tensor, kv_cache: torch.Tensor, diff --git a/vllm/envs.py b/vllm/envs.py index cb75ba1a62de9..d0f2798096263 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -239,6 +239,7 @@ if TYPE_CHECKING: VLLM_NCCL_INCLUDE_PATH: str | None = None VLLM_USE_FBGEMM: bool = False VLLM_GC_DEBUG: str = "" + VLLM_DEBUG_WORKSPACE: bool = False VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = False VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256 VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary" @@ -1537,6 +1538,9 @@ environment_variables: dict[str, Callable[[], Any]] = { # - VLLM_GC_DEBUG='{"top_objects":5}': enable GC debugger with # top 5 collected objects "VLLM_GC_DEBUG": lambda: os.getenv("VLLM_GC_DEBUG", ""), + # Debug workspace allocations. + # logging of workspace resize operations. + "VLLM_DEBUG_WORKSPACE": lambda: bool(int(os.getenv("VLLM_DEBUG_WORKSPACE", "0"))), # Disables parallel execution of shared_experts via separate cuda stream "VLLM_DISABLE_SHARED_EXPERTS_STREAM": lambda: bool( int(os.getenv("VLLM_DISABLE_SHARED_EXPERTS_STREAM", "0")) diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 075610ec588ae..9e75a7c08070e 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -22,12 +22,12 @@ from vllm.model_executor.layers.fused_moe.utils import ( from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv from vllm.v1.worker.ubatching import ( - dbo_current_ubatch_id, dbo_enabled, dbo_maybe_run_recv_hook, dbo_register_recv_hook, dbo_yield, ) +from vllm.v1.worker.workspace import current_workspace_manager logger = init_logger(__name__) @@ -661,25 +661,6 @@ def _slice_scales( return None -class SharedResizableBuffer: - def __init__(self): - self.buffer = None - - def get( - self, shape: tuple[int, ...], device: torch.device, dtype: torch.dtype - ) -> torch.Tensor: - assert shape != () - shape_numel = prod(shape) - if ( - self.buffer is None - or self.buffer.numel() < shape_numel - or self.buffer.device != device - or self.buffer.dtype != dtype - ): - self.buffer = torch.empty(shape_numel, device=device, dtype=dtype) - return self.buffer[:shape_numel].view(*shape) - - @final class FusedMoEModularKernel(torch.nn.Module): """ @@ -694,22 +675,6 @@ class FusedMoEModularKernel(torch.nn.Module): objects. """ - class SharedBuffers: - def __init__(self) -> None: - self.fused_out = SharedResizableBuffer() - self.workspace13 = SharedResizableBuffer() - self.workspace2 = SharedResizableBuffer() - - # Persistent buffers that are shared across `FusedMoEModularKernel` - # instances (layers), to save memory and allocattions. - # - # We have two sets of buffers to support dual batch overlap (DBO) where each - # microbatch (ubatch) should use its own set of buffers to avoid - # cross-ubatch contimination. - # NOTE that memory is lazily allocated for these buffers, meaning that if - # DBO isn't being used, the second SharedBuffers will be empty. - shared_buffers: list[SharedBuffers] = [SharedBuffers(), SharedBuffers()] - def __init__( self, prepare_finalize: FusedMoEPrepareAndFinalize, @@ -806,10 +771,6 @@ class FusedMoEModularKernel(torch.nn.Module): assert M_full > 0 and M_chunk > 0 num_chunks, _ = self._chunk_info(M_full) - - # select per-ubatch buffers to avoid cross-ubatch reuse under DBO - ubatch_idx = dbo_current_ubatch_id() - buffers = self.shared_buffers[ubatch_idx] workspace_dtype = self.fused_experts.workspace_dtype(out_dtype) # Force worst-case allocation in profiling run for @@ -832,14 +793,11 @@ class FusedMoEModularKernel(torch.nn.Module): expert_tokens_meta, ) ) - buffers.workspace13.get( - max_workspace_13, device=device, dtype=workspace_dtype - ) - buffers.workspace2.get( - max_workspace_2, device=device, dtype=workspace_dtype - ) - buffers.fused_out.get( - max_fused_out_shape, device=device, dtype=workspace_dtype + + current_workspace_manager().get_simultaneous( + (max_workspace_13, workspace_dtype), + (max_workspace_2, workspace_dtype), + (max_fused_out_shape, out_dtype), ) # Get intermediate workspace shapes based off the chunked M size. @@ -866,22 +824,23 @@ class FusedMoEModularKernel(torch.nn.Module): # We can reuse the memory between cache1 and cache3 because by the # time we need cache3, we're done with cache1. - workspace13 = buffers.workspace13.get( - workspace13_shape, device=device, dtype=workspace_dtype - ) - workspace2 = buffers.workspace2.get( - workspace2_shape, device=device, dtype=workspace_dtype - ) - # Construct the entire output that can then be processed in chunks. # Reuse workspace13 for the output in the non-chunked case as long # as it is large enough. This will not always be the case for standard # format experts and with experts that have empty workspaces. if num_chunks == 1 and prod(workspace13_shape) >= prod(fused_out_shape): + workspace13, workspace2 = current_workspace_manager().get_simultaneous( + (workspace13_shape, workspace_dtype), + (workspace2_shape, workspace_dtype), + ) fused_out = _resize_cache(workspace13, fused_out_shape) else: - fused_out = buffers.fused_out.get( - fused_out_shape, device=device, dtype=out_dtype + workspace13, workspace2, fused_out = ( + current_workspace_manager().get_simultaneous( + (workspace13_shape, workspace_dtype), + (workspace2_shape, workspace_dtype), + (fused_out_shape, out_dtype), + ) ) return workspace13, workspace2, fused_out diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index a9fa76deecbd2..146124153c79d 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -83,6 +83,7 @@ from vllm.v1.attention.backends.mla.indexer import ( DeepseekV32IndexerMetadata, ) from vllm.v1.kv_cache_interface import KVCacheSpec, MLAAttentionSpec +from vllm.v1.worker.workspace import current_workspace_manager from .interfaces import MixtureOfExperts, SupportsEagle, SupportsLoRA, SupportsPP from .utils import ( @@ -616,8 +617,15 @@ def sparse_attn_indexer( # careful! this will be None in dummy run attn_metadata = get_forward_context().attn_metadata fp8_dtype = current_platform.fp8_dtype() + # assert isinstance(attn_metadata, dict) if not isinstance(attn_metadata, dict): + # Reserve workspace for indexer during profiling run + current_workspace_manager().get_simultaneous( + ((total_seq_lens, head_dim), torch.float8_e4m3fn), + ((total_seq_lens, 4), torch.uint8), + ) + return sparse_attn_indexer_fake( hidden_states, k_cache_prefix, @@ -651,17 +659,17 @@ def sparse_attn_indexer( topk_indices_buffer[: hidden_states.shape[0]] = -1 if has_prefill: prefill_metadata = attn_metadata.prefill + + # Get the full shared workspace buffers once (will allocate on first use) + workspace_manager = current_workspace_manager() + k_fp8_full, k_scale_full = workspace_manager.get_simultaneous( + ((total_seq_lens, head_dim), fp8_dtype), + ((total_seq_lens, 4), torch.uint8), + ) + for chunk in prefill_metadata.chunks: - k_fp8 = torch.empty( - [chunk.total_seq_lens, head_dim], - device=k.device, - dtype=fp8_dtype, - ) - k_scale = torch.empty( - [chunk.total_seq_lens, 4], - device=k.device, - dtype=torch.uint8, - ) + k_fp8 = k_fp8_full[: chunk.total_seq_lens] + k_scale = k_scale_full[: chunk.total_seq_lens] ops.cp_gather_indexer_k_quant_cache( kv_cache, k_fp8, @@ -777,15 +785,6 @@ def sparse_attn_indexer_fake( total_seq_lens: int, topk_indices_buffer: torch.Tensor | None, ) -> torch.Tensor: - # profile run - # NOTE(Chen): create the max possible flattened_kv. So that - # profile_run can get correct memory usage. - _flattened_kv = torch.empty( - [total_seq_lens, head_dim + 4], device=k.device, dtype=torch.uint8 - ) - fp8_dtype = current_platform.fp8_dtype() - _k_fp8 = _flattened_kv[..., :head_dim].view(fp8_dtype).contiguous() - _k_scale = _flattened_kv[..., head_dim:].view(torch.float32).contiguous() return topk_indices_buffer diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py index 1eee1d225293b..f3052fbaf2a65 100644 --- a/vllm/v1/attention/backends/mla/flashmla_sparse.py +++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py @@ -18,7 +18,7 @@ from vllm.attention.ops.flashmla import ( flash_mla_with_kvcache, get_mla_metadata, ) -from vllm.config import VllmConfig +from vllm.config import VllmConfig, get_current_vllm_config from vllm.config.cache import CacheDType from vllm.logger import init_logger from vllm.platforms import current_platform @@ -30,13 +30,31 @@ from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, + reshape_attn_output_for_spec_decode, + reshape_query_for_spec_decode, + split_decodes_and_prefills, + split_prefill_chunks, ) from vllm.v1.kv_cache_interface import AttentionSpec +from vllm.v1.worker.workspace import current_workspace_manager if TYPE_CHECKING: from vllm.model_executor.models.deepseek_v2 import Indexer logger = init_logger(__name__) + +# For FP8 sparse attention we have two impelementations: +# 1. Mixed batch mode: use the FP8 decode kernel for both prefill and decode this is +# done by treating all tokens as single batch. +# 2. Separate prefill and decode mode: use the BF16 prefill kernel for prefill +# (upconverting the FP8 cache to BF16 then calling the prefill kernel) and using +# the FP8 decode kernel for decode. +# Currently we use #1 when the number of heads per rank is low (i.e. TP) since the BF16 +# prefill kernel requires padding the numer of heads to 128 while the decode does not +# so when the per ranke head count is below MIN_HEADS_FOR_BF16_PREFILL we use the mixed +# batch mode (#2). +MIN_HEADS_FOR_BF16_PREFILL = 32 + """ NOTE: FlashMLA Sparse uses an fp8 cache with the following format @@ -127,19 +145,72 @@ class FlashMLASparseMetadata: dummy_block_table: torch.Tensor cache_lens: torch.Tensor - fp8_extra_metadata: FP8KernelMetadata | None = None + @dataclass + class FP8SeperatePrefillDecode: + @dataclass + class Decode: + kernel_metadata: "FlashMLASparseMetadata.FP8KernelMetadata" + decode_query_len: int # needed for reshape in spec decode + + @dataclass + class Prefill: + # Sequence lengths (context + query) for prefill requests + # Shape: [num_prefill_reqs] + seq_lens: torch.Tensor + + # Request ID for each token: -1 for decode tokens, request index + # (0, 1, 2, ...) for prefill tokens. + # Shape: [num_actual_tokens] + request_ids: torch.Tensor + + # Workspace start offsets for all prefill requests + # Shape: [num_prefill_reqs], adjusted in-place per chunk to be + # 0-indexed within each chunk. Used to map prefill tokens to workspace + # offsets in convert_logical_index_to_physical_index + workspace_starts: torch.Tensor + + @dataclass + class Chunk: + """Metadata for a chunk of prefill requests. + + Prefill requests may be chunked to fit within the fixed workspace size. + """ + + seq_lens: torch.Tensor + tokens_slice: slice + block_table: torch.Tensor + req_start_idx: int + workspace_starts: torch.Tensor + chunk_tot_seqlen: int + + chunks: list[Chunk] + + num_prefills: int = 0 + num_decodes: int = 0 + num_prefill_tokens: int = 0 + num_decode_tokens: int = 0 + + decode: Decode | None = None + prefill: Prefill | None = None + + fp8_extra_metadata: FP8SeperatePrefillDecode | FP8KernelMetadata | None = None + fp8_use_mixed_batch: bool = False +# Kernel with prefill workspace support @triton.jit def _convert_req_index_to_global_index_kernel( req_id_ptr, # int32 [num_tokens] block_table_ptr, # int32 [num_requests, max_num_blocks_per_req] token_indices_ptr, # int32 [num_tokens, NUM_TOPK_TOKENS] out_ptr, # int32 [num_tokens, NUM_TOPK_TOKENS] + prefill_request_id_ptr, # int32 [num_tokens], -1 for decode, >=0 for prefill + workspace_starts_ptr, # int32 [num_prefill_reqs+1] or nullptr # shapes (compile-time where possible) max_num_blocks_per_req: tl.constexpr, BLOCK_SIZE: tl.constexpr, BLOCK_N: tl.constexpr, # tile width along columns + HAS_PREFILL: tl.constexpr, # strides (in elements) bt_stride0, bt_stride1, @@ -165,7 +236,10 @@ def _convert_req_index_to_global_index_kernel( # Only token == -1 should propagate as -1 is_invalid_tok = tok < 0 - + is_prefill = False + if HAS_PREFILL: + prefill_req_id = tl.load(prefill_request_id_ptr + token_id) + is_prefill = prefill_req_id >= 0 # Compute block id and in-block offset block_id = tok // BLOCK_SIZE inblock_off = tok % BLOCK_SIZE @@ -173,12 +247,18 @@ def _convert_req_index_to_global_index_kernel( # Guard block_table access valid_block = (block_id < max_num_blocks_per_req) & (block_id >= 0) bt_ptr = block_table_ptr + req * bt_stride0 + block_id * bt_stride1 - base = tl.load(bt_ptr, mask=valid_block, other=0) + is_invalid_tok |= ~valid_block + base = tl.load(bt_ptr, mask=valid_block & ~is_prefill, other=0) + out_val = base * BLOCK_SIZE + inblock_off - # If token == -1 OR block_id OOB, output -1; else base * BLOCK_SIZE + offset - out_val = tl.where( - is_invalid_tok | (~valid_block), -1, base * BLOCK_SIZE + inblock_off - ) + # Override with prefill output if prefill is enabled + if HAS_PREFILL: + workspace_start = tl.load( + workspace_starts_ptr + prefill_req_id, mask=is_prefill, other=0 + ) + prefill_out = workspace_start + tok + out_val = tl.where(is_prefill, prefill_out, out_val) + out_val = tl.where(is_invalid_tok, -1, out_val) # Store results out_ptr_ij = out_ptr + token_id * out_stride0 + indice_id * out_stride1 @@ -192,6 +272,9 @@ def triton_convert_req_index_to_global_index( BLOCK_SIZE: int = 64, NUM_TOPK_TOKENS: int = 2048, BLOCK_N: int = 128, # tile width along columns + HAS_PREFILL_WORKSPACE: bool = False, + prefill_workspace_request_ids: torch.Tensor | None = None, + prefill_workspace_starts: torch.Tensor | None = None, ): """ out[token_id, indice_id] = @@ -202,17 +285,32 @@ def triton_convert_req_index_to_global_index( Only when token_indices[token_id, indice_id] == -1 do we output -1. For safety, we also output -1 if the derived block_id would be out-of-bounds. + + When HAS_PREFILL_WORKSPACE is True, prefill tokens are mapped to workspace offsets + instead of global cache slots. prefill_workspace_request_ids and + prefill_workspace_starts must be provided. + + prefill_workspace_request_ids: int32 [num_tokens], -1 for decode else + prefill request index (maps to prefill_workspace_starts) + prefill_workspace_starts: int32 [num_prefills], 0-indexed workspace + starts for each prefill request """ assert req_id.dtype == torch.int32 assert block_table.dtype == torch.int32 assert token_indices.dtype == torch.int32 assert token_indices.shape[1] == NUM_TOPK_TOKENS assert NUM_TOPK_TOKENS % BLOCK_N == 0, ( - f"NUM_TOPK_TOKENS ({NUM_TOPK_TOKENS}) must be divisible byBLOCK_N ({BLOCK_N})" + f"NUM_TOPK_TOKENS ({NUM_TOPK_TOKENS}) must be divisible by BLOCK_N ({BLOCK_N})" ) + if HAS_PREFILL_WORKSPACE: + assert prefill_workspace_request_ids is not None + assert prefill_workspace_starts is not None + assert prefill_workspace_request_ids.dtype == torch.int32 + assert prefill_workspace_starts.dtype == torch.int32 + num_tokens = req_id.shape[0] - num_requests, max_num_blocks_per_req = block_table.shape + max_num_blocks_per_req = block_table.shape[1] tiles_per_row = NUM_TOPK_TOKENS // BLOCK_N # Ensure contiguous tensors on the same device @@ -226,6 +324,13 @@ def triton_convert_req_index_to_global_index( ti_stride0, ti_stride1 = token_indices_c.stride() out_stride0, out_stride1 = out.stride() + # Prepare prefill pointers + if HAS_PREFILL_WORKSPACE: + assert prefill_workspace_request_ids is not None # for mypy + assert prefill_workspace_starts is not None # for mypy + assert prefill_workspace_request_ids.is_contiguous() + assert prefill_workspace_starts.is_contiguous() + # Exact 2D grid: tokens × column tiles grid = (num_tokens, tiles_per_row) @@ -234,10 +339,13 @@ def triton_convert_req_index_to_global_index( block_table_c, token_indices_c, out, + prefill_workspace_request_ids, + prefill_workspace_starts, # shapes / constexprs max_num_blocks_per_req, BLOCK_SIZE, BLOCK_N, + HAS_PREFILL_WORKSPACE, # strides bt_stride0, bt_stride1, @@ -249,7 +357,16 @@ def triton_convert_req_index_to_global_index( return out -@dataclass +def get_prefill_workspace_size(max_model_len: int): + # NOTE(Lucas): 5 is a magic number for controlling the prefill buffer size. + # May be tuned later. + # Memory usage: 5 * max_model_len * 576 * 2 bytes + # Example: DeepSeek-V3.2 with max_model_len=163840 -> + # 5 * 163840 * 576 * 2 = ~900 MB + # This fits nicely below the typical MoE workspace size of >2GB so this is "free" + return max_model_len * 5 + + class FlashMLASparseMetadataBuilder(AttentionMetadataBuilder[FlashMLASparseMetadata]): _cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.UNIFORM_BATCH @@ -259,29 +376,42 @@ class FlashMLASparseMetadataBuilder(AttentionMetadataBuilder[FlashMLASparseMetad layer_names: list[str], vllm_config: VllmConfig, device: torch.device, - ): + ) -> None: + self.vllm_config = vllm_config + self.layer_names = layer_names cache_config = vllm_config.cache_config self.kv_cache_spec = kv_cache_spec self.model_config = vllm_config.model_config parallel_config = vllm_config.parallel_config self.device = device + # Treat requests with query length <= 1 as decodes to match the + # DeepGEMM indexer constraint (fp8_paged_mqa_logits only supports next_n <= 2) + self._init_reorder_batch_threshold(1, supports_spec_as_decode=True) + props = torch.cuda.get_device_properties(device) sm_count = props.multi_processor_count self.num_heads = self.model_config.get_num_attention_heads(parallel_config) self.mla_dims = get_mla_dims(self.model_config) + self.topk_tokens = vllm_config.model_config.hf_config.index_topk self.use_fp8_kv_cache = cache_config.cache_dtype == "fp8_ds_mla" - self.topk_tokens_tensor = torch.tensor( - [self.topk_tokens], device=device, dtype=torch.int32 + max_num_seqs = vllm_config.scheduler_config.max_num_seqs + # Shape: [max_num_seqs], all elements = topk_tokens (constant for full-CG) + self.topk_tokens_tensor = torch.full( + (max_num_seqs,), self.topk_tokens, device=device, dtype=torch.int32 ) - self.max_model_len_tensor = torch.tensor( - [self.model_config.max_model_len], device=device, dtype=torch.int32 + # Shape: [max_num_seqs], all elements = max_model_len + self.max_model_len_tensor = torch.full( + (max_num_seqs,), + self.model_config.max_model_len, + device=device, + dtype=torch.int32, ) # this is ignored by `flash_mla_with_kvcache` if indices not None self.dummy_block_table = torch.empty( - (1, 1), dtype=torch.int32, device=self.device + (max_num_seqs, 1), dtype=torch.int32, device=self.device ) # Equation taken from FlashMLA/csrc/pybind.cpp @@ -299,10 +429,9 @@ class FlashMLASparseMetadataBuilder(AttentionMetadataBuilder[FlashMLASparseMetad dtype=torch.int32, device=device, ) + # Sized for per-request batching (num_decodes + 1) self.num_splits_buffer = torch.empty( - # We pack all the tokens into one batch for sparse attention. - # Otherwise, we can exceed the sm of `get_mla_metadata`. - (2,), + (max_num_seqs + 1,), dtype=torch.int32, device=device, ) @@ -312,30 +441,171 @@ class FlashMLASparseMetadataBuilder(AttentionMetadataBuilder[FlashMLASparseMetad device=device, ) - def build( + def _build_fp8_mixed_decode_prefill( self, - common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata, - fast_build: bool = False, - ) -> FlashMLASparseMetadata: - num_tokens = common_attn_metadata.num_actual_tokens - starts = np.asarray(common_attn_metadata.query_start_loc_cpu, dtype=np.int32) - seg_lengths = np.diff(starts) - req_id_per_token = np.repeat( - np.arange(seg_lengths.shape[0], dtype=np.int32), seg_lengths - ) - # Zero-fill for cudagraphs - self.req_id_per_token_buffer.fill_(0) - self.req_id_per_token_buffer[: req_id_per_token.shape[0]].copy_( - torch.from_numpy(req_id_per_token), non_blocking=True - ) - req_id_per_token = self.req_id_per_token_buffer[:num_tokens] + ) -> "FlashMLASparseMetadata.FP8KernelMetadata": + """Build FP8 metadata treating all tokens as one mixed batch. + + This matches main branch's approach and avoids the BF16 prefill kernel + which has head padding overhead when num_heads is small (high TP case). + """ + num_tokens = common_attn_metadata.num_actual_tokens + + # Build metadata for all tokens as a single batch + tile_scheduler_metadata, num_splits = get_mla_metadata( + cache_seqlens=self.topk_tokens_tensor[:1], # Single batch + num_q_tokens_per_head_k=num_tokens * self.num_heads, + topk=self.topk_tokens, + num_heads_q=self.num_heads, + num_heads_k=1, + is_fp8_kvcache=True, + ) + + num_sm_parts = tile_scheduler_metadata.size(0) + tile_scheduler_metadata_buffer = self.tile_scheduler_metadata_buffer[ + :num_sm_parts + ] + tile_scheduler_metadata_buffer.copy_(tile_scheduler_metadata) + num_splits_view = self.num_splits_buffer[:2] + num_splits_view.copy_(num_splits) + + fp8_metadata = FlashMLASparseMetadata.FP8KernelMetadata( + scheduler_metadata=tile_scheduler_metadata_buffer, + num_splits=num_splits_view, + cache_lens=self.max_model_len_tensor[:1], + dummy_block_table=self.dummy_block_table[:1], + ) + + return fp8_metadata + + def _build_fp8_separate_prefill_decode( + self, + common_attn_metadata: CommonAttentionMetadata, + ) -> "FlashMLASparseMetadata.FP8SeperatePrefillDecode": + num_tokens = common_attn_metadata.num_actual_tokens + + (num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens) = ( + split_decodes_and_prefills( + common_attn_metadata, + decode_threshold=self.reorder_batch_threshold or 1, + require_uniform=True, + ) + ) + + FP8Meta = FlashMLASparseMetadata.FP8SeperatePrefillDecode + fp8_metadata = FP8Meta( + num_decodes=num_decodes, + num_prefills=num_prefills, + num_decode_tokens=num_decode_tokens, + num_prefill_tokens=num_prefill_tokens, + ) + + # Extract prefill sequence lengths (context + query, not just query) + # Decode requests come first in the batch, prefill requests follow + prefill_seq_lens = None + prefill_request_id = None + prefill_workspace_starts = None + prefill_chunks = None + + # For pure decode batches, prefill_request_id will be None + # For mixed batches, it will have -1 for decode and request_id for prefill + if num_prefills > 0: + seq_lens_cpu = common_attn_metadata.seq_lens_cpu + seq_lens = common_attn_metadata.seq_lens + query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu + + prefill_seq_lens_cpu = seq_lens_cpu[num_decodes:] + prefill_seq_lens = seq_lens[num_decodes:] + + # Build prefill_request_id: -1 for decode, request index for + # prefill. This enables a single + # convert_logical_index_to_physical_index call for all tokens + prefill_request_id = torch.full( + (num_tokens,), -1, dtype=torch.int32, device=self.device + ) + # Map prefill tokens to their request IDs (0, 1, 2, ...) + for req_idx in range(num_prefills): + # Get query token range for this prefill request + global_req_idx = num_decodes + req_idx + req_query_start = query_start_loc_cpu[global_req_idx] + req_query_end = query_start_loc_cpu[global_req_idx + 1] + prefill_request_id[req_query_start:req_query_end] = req_idx + + # will be adjusted by chunk loop + prefill_workspace_starts_cpu = torch.zeros( + num_prefills, dtype=torch.int32, pin_memory=True + ) + prefill_workspace_starts_cpu[1:] = torch.cumsum( + prefill_seq_lens_cpu[:-1], dim=0 + ) + # populated by non-blocking copy after prefill_workspace_starts_cpu is + # updated by each chunk + prefill_workspace_starts = torch.empty( + num_prefills, dtype=torch.int32, device=self.device + ) + + # Chunk prefill requests to fit within workspace size + max_prefill_buffer_size = get_prefill_workspace_size( + self.vllm_config.model_config.max_model_len + ) + chunk_bounds = split_prefill_chunks( + prefill_seq_lens_cpu, max_prefill_buffer_size + ) + + prefill_chunks = [] + for chunk_start, chunk_end in chunk_bounds: + # Adjust workspace_starts in-place per chunk to be + # 0-indexed within each chunk + # Example: seq_lens=[10,15,20,5], chunks=[[0,2],[2,4]] + # Initial: workspace_starts=[0,10,25,45] + # After: workspace_starts=[0,10,0,20] + # (chunk 0 starts at 0, chunk 1 starts at 0) + offset = prefill_workspace_starts_cpu[chunk_start].item() + prefill_workspace_starts_cpu[chunk_start:chunk_end] -= offset + + chunk_seq_lens = prefill_seq_lens[chunk_start:chunk_end] + chunk_tot_seqlen = prefill_seq_lens_cpu[chunk_start:chunk_end].sum() + token_start = query_start_loc_cpu[num_decodes + chunk_start].item() + token_end = query_start_loc_cpu[num_decodes + chunk_end].item() + tokens_slice = slice(token_start, token_end) + + # Create chunk view of gpu tensor + chunk_workspace_starts = prefill_workspace_starts[chunk_start:chunk_end] + chunk_block_table = common_attn_metadata.block_table_tensor[ + num_decodes + chunk_start : num_decodes + chunk_end + ] + + prefill_chunks.append( + FP8Meta.Prefill.Chunk( + seq_lens=chunk_seq_lens, + tokens_slice=tokens_slice, + block_table=chunk_block_table, + req_start_idx=chunk_start, + workspace_starts=chunk_workspace_starts, + chunk_tot_seqlen=chunk_tot_seqlen, + ) + ) + + prefill_workspace_starts.copy_( + prefill_workspace_starts_cpu, non_blocking=True + ) + + fp8_metadata.prefill = FP8Meta.Prefill( + seq_lens=prefill_seq_lens, + request_ids=prefill_request_id, + workspace_starts=prefill_workspace_starts, + chunks=prefill_chunks, + ) + + if num_decodes > 0: + # Compute decode_query_len for spec decode (uniform due to require_uniform) + query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu + decode_query_len = (query_start_loc_cpu[1] - query_start_loc_cpu[0]).item() - fp8_extra_metadata = None - if self.use_fp8_kv_cache: tile_scheduler_metadata, num_splits = get_mla_metadata( - cache_seqlens=self.topk_tokens_tensor, - num_q_tokens_per_head_k=num_tokens * self.num_heads, + cache_seqlens=self.topk_tokens_tensor[:num_decodes], + num_q_tokens_per_head_k=decode_query_len * self.num_heads, topk=self.topk_tokens, num_heads_q=self.num_heads, num_heads_k=1, @@ -348,33 +618,70 @@ class FlashMLASparseMetadataBuilder(AttentionMetadataBuilder[FlashMLASparseMetad :num_sm_parts ] tile_scheduler_metadata_buffer.copy_(tile_scheduler_metadata) - self.num_splits_buffer.copy_(num_splits) + # num_splits has size [num_decodes + 1] + num_splits_view = self.num_splits_buffer[: num_decodes + 1] + num_splits_view.copy_(num_splits) - fp8_extra_metadata = FlashMLASparseMetadata.FP8KernelMetadata( + kernel_meta = FlashMLASparseMetadata.FP8KernelMetadata( scheduler_metadata=tile_scheduler_metadata_buffer, - num_splits=self.num_splits_buffer, - # cache_lens and block_table are basically unused in sparse case - # but the decode kernel will treat -1 and indices >= cache_lens - # as invalid so we make sure cache_lens is large enough to not - # accidentally mark indices invalid, we will use -1 exclusively - # to mark invalid indices - cache_lens=self.max_model_len_tensor, - dummy_block_table=self.dummy_block_table, + num_splits=num_splits_view, + dummy_block_table=self.dummy_block_table[:num_decodes], + cache_lens=self.max_model_len_tensor[:num_decodes], + ) + fp8_metadata.decode = FP8Meta.Decode( + kernel_metadata=kernel_meta, + decode_query_len=decode_query_len, ) + return fp8_metadata + + def build( + self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False, + ) -> FlashMLASparseMetadata: + cm = common_attn_metadata + num_tokens = cm.num_actual_tokens + starts = np.asarray(cm.query_start_loc_cpu, dtype=np.int32) + seg_lengths = np.diff(starts) + req_id_per_token = np.repeat( + np.arange(seg_lengths.shape[0], dtype=np.int32), seg_lengths + ) + # Zero-fill for cudagraphs + self.req_id_per_token_buffer.fill_(0) + self.req_id_per_token_buffer[: req_id_per_token.shape[0]].copy_( + torch.from_numpy(req_id_per_token), non_blocking=True + ) + req_id_per_token = self.req_id_per_token_buffer[:num_tokens] + + fp8_extra_metadata: ( + FlashMLASparseMetadata.FP8SeperatePrefillDecode + | FlashMLASparseMetadata.FP8KernelMetadata + | None + ) = None + fp8_use_mixed_batch = self.num_heads < MIN_HEADS_FOR_BF16_PREFILL + if self.use_fp8_kv_cache: + if fp8_use_mixed_batch: + fp8_extra_metadata = self._build_fp8_mixed_decode_prefill(cm) + else: + fp8_extra_metadata = self._build_fp8_separate_prefill_decode(cm) + metadata = FlashMLASparseMetadata( - num_reqs=common_attn_metadata.num_reqs, - max_query_len=common_attn_metadata.max_query_len, - max_seq_len=common_attn_metadata.max_seq_len, - num_actual_tokens=common_attn_metadata.num_actual_tokens, - query_start_loc=common_attn_metadata.query_start_loc, - slot_mapping=common_attn_metadata.slot_mapping, - block_table=common_attn_metadata.block_table_tensor, + num_reqs=cm.num_reqs, + max_query_len=cm.max_query_len, + max_seq_len=cm.max_seq_len, + num_actual_tokens=cm.num_actual_tokens, + query_start_loc=cm.query_start_loc, + slot_mapping=cm.slot_mapping, + block_table=cm.block_table_tensor, req_id_per_token=req_id_per_token, block_size=self.kv_cache_spec.block_size, topk_tokens=self.topk_tokens, fp8_extra_metadata=fp8_extra_metadata, + fp8_use_mixed_batch=fp8_use_mixed_batch, ) + return metadata @@ -414,12 +721,204 @@ class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]): self.topk_indices_buffer = indexer.topk_indices_buffer self.padding = 128 if current_platform.is_device_capability(100) else 64 + if kv_cache_dtype == "fp8_ds_mla": + # Reserve workspace during initialization + vllm_config = get_current_vllm_config() + assert vllm_config is not None and vllm_config.model_config is not None + prefill_workspace_size = get_prefill_workspace_size( + vllm_config.model_config.max_model_len + ) + self.prefill_workspace_shape = (prefill_workspace_size, head_size) + (self.prefill_bf16_workspace,) = ( + current_workspace_manager().get_simultaneous( + (self.prefill_workspace_shape, torch.bfloat16) + ) + ) + def _forward_bf16_kv( self, q: torch.Tensor, kv_c_and_k_pe_cache: torch.Tensor, topk_indices: torch.Tensor, attn_metadata: FlashMLASparseMetadata, + ) -> torch.Tensor: + # Convert per-request indices to global slots (decode) or workspace + # offsets (prefill). + topk_indices = triton_convert_req_index_to_global_index( + attn_metadata.req_id_per_token, + attn_metadata.block_table, + topk_indices, + BLOCK_SIZE=attn_metadata.block_size, + NUM_TOPK_TOKENS=topk_indices.shape[1], + ) + + return self._bf16_flash_mla_kernel(q, kv_c_and_k_pe_cache, topk_indices) + + def _forward_fp8_kv_separate_prefill_decode( + self, + q: torch.Tensor, + kv_c_and_k_pe_cache: torch.Tensor, + topk_indices: torch.Tensor, + attn_metadata: FlashMLASparseMetadata, + ) -> torch.Tensor: + fp8_metadata = attn_metadata.fp8_extra_metadata + assert isinstance(fp8_metadata, FlashMLASparseMetadata.FP8SeperatePrefillDecode) + num_decodes = fp8_metadata.num_decodes + + prefill_request_ids = None + prefill_workspace_starts = None + has_prefill_workspace = False + if fp8_metadata.prefill is not None: + prefill_request_ids = fp8_metadata.prefill.request_ids + prefill_workspace_starts = fp8_metadata.prefill.workspace_starts + has_prefill_workspace = True + + # Convert per-request indices to global slots (decode) or workspace + # offsets (prefill). + # For FP8 cache: prefill uses workspace mapping (upconverted to BF16) + # For BF16 cache: always use global cache slots (no workspace) + # prefill_workspace_starts has been adjusted in-place per chunk so + # prefill indices automatically come out chunk-local + topk_indices = triton_convert_req_index_to_global_index( + attn_metadata.req_id_per_token, + attn_metadata.block_table, + topk_indices, + BLOCK_SIZE=attn_metadata.block_size, + NUM_TOPK_TOKENS=topk_indices.shape[1], + HAS_PREFILL_WORKSPACE=has_prefill_workspace, + prefill_workspace_request_ids=prefill_request_ids, + prefill_workspace_starts=prefill_workspace_starts, + ) + + fp8_metadata = attn_metadata.fp8_extra_metadata + assert isinstance(fp8_metadata, FlashMLASparseMetadata.FP8SeperatePrefillDecode) + + def _fp8_decode(q: torch.Tensor, topk_indices: torch.Tensor) -> torch.Tensor: + # Reshape q: (num_decode_tokens, num_heads, head_dim) + # -> (num_decodes, seq_len, num_heads, head_dim) + q = reshape_query_for_spec_decode(q, num_decodes) + seq_len = q.shape[1] + # Reshape topk_indices: (num_decode_tokens, topk) + # -> (num_decodes, seq_len, topk) + topk_indices = topk_indices.view(num_decodes, seq_len, -1) + assert fp8_metadata.decode is not None + attn_out, _ = self._fp8_flash_mla_kernel( + q=q, + kv_c_and_k_pe_cache=kv_c_and_k_pe_cache, + topk_indices=topk_indices, + kernel_metadata=fp8_metadata.decode.kernel_metadata, + ) + # Reshape output: (num_decodes, seq_len, num_heads, head_dim_v) + # -> (num_decode_tokens, num_heads, head_dim_v) + return reshape_attn_output_for_spec_decode(attn_out) + + num_decode_tokens = fp8_metadata.num_decode_tokens + num_prefill_tokens = fp8_metadata.num_prefill_tokens + + # Pure decode: direct call without allocation + if num_decode_tokens > 0 and num_prefill_tokens == 0: + assert fp8_metadata.decode is not None + attn_out = _fp8_decode(q, topk_indices) + else: + # Mixed or pure prefill: allocate output tensor + attn_out = q.new_empty( + (attn_metadata.num_actual_tokens, self.num_heads, self.kv_lora_rank), + dtype=q.dtype, + device=q.device, + ) + + if num_decode_tokens > 0: + attn_out[:num_decode_tokens] = _fp8_decode( + q[:num_decode_tokens], topk_indices[:num_decode_tokens] + ) + + assert fp8_metadata.prefill is not None + for chunk in fp8_metadata.prefill.chunks: + chunk_workspace = self.prefill_bf16_workspace[: chunk.chunk_tot_seqlen] + ops.cp_gather_and_upconvert_fp8_kv_cache( + kv_c_and_k_pe_cache, + chunk_workspace, + chunk.block_table, + chunk.seq_lens, + chunk.workspace_starts, + len(chunk.block_table), + ) + + chunk_q = q[chunk.tokens_slice] + chunk_topk_indices_workspace = topk_indices[chunk.tokens_slice] + + attn_out[chunk.tokens_slice] = self._bf16_flash_mla_kernel( + chunk_q, + chunk_workspace, + chunk_topk_indices_workspace, + ) + + return attn_out + + def _forward_fp8_kv_mixed_batch( + self, + q: torch.Tensor, + kv_c_and_k_pe_cache: torch.Tensor, + topk_indices: torch.Tensor, + attn_metadata: FlashMLASparseMetadata, + ) -> torch.Tensor: + """Mixed batch FP8 forward path that treats all tokens as one batch. + + This is equivalent to main branch's approach and avoids the BF16 + prefill kernel which has head padding overhead when num_heads is small. + Used when use_mixed_batch is True. + """ + # Convert per-request indices to global slots (decode) or workspace + # offsets (prefill). + topk_indices = triton_convert_req_index_to_global_index( + attn_metadata.req_id_per_token, + attn_metadata.block_table, + topk_indices, + BLOCK_SIZE=attn_metadata.block_size, + NUM_TOPK_TOKENS=topk_indices.shape[1], + ) + + assert attn_metadata.fp8_extra_metadata is not None + assert isinstance( + attn_metadata.fp8_extra_metadata, FlashMLASparseMetadata.FP8KernelMetadata + ) + fp8_metadata = attn_metadata.fp8_extra_metadata + + _attn_out, _ = self._fp8_flash_mla_kernel( + q=q.unsqueeze(0), # unsqueeze to add batch_dim: (T, H, D) -> (1, T, H, D) + kv_c_and_k_pe_cache=kv_c_and_k_pe_cache, + topk_indices=topk_indices.unsqueeze(0), # (T, topk) -> (1, T, topk) + kernel_metadata=fp8_metadata, + ) + + # Output is (1, T, H, D_v), squeeze back to (T, H, D_v) + return _attn_out.squeeze(0) + + def _fp8_flash_mla_kernel( + self, + q: torch.Tensor, + kv_c_and_k_pe_cache: torch.Tensor, + topk_indices: torch.Tensor, + kernel_metadata: FlashMLASparseMetadata.FP8KernelMetadata, + ) -> torch.Tensor: + return flash_mla_with_kvcache( + q=q, + k_cache=kv_c_and_k_pe_cache.view(torch.uint8).unsqueeze(-2), + block_table=kernel_metadata.dummy_block_table, + head_dim_v=512, + cache_seqlens=kernel_metadata.cache_lens, + tile_scheduler_metadata=kernel_metadata.scheduler_metadata, + num_splits=kernel_metadata.num_splits, + is_fp8_kvcache=True, + indices=topk_indices, + softmax_scale=self.softmax_scale, + ) + + def _bf16_flash_mla_kernel( + self, + q: torch.Tensor, + kv_c_and_k_pe_cache: torch.Tensor, + topk_indices: torch.Tensor, ) -> torch.Tensor: num_tokens = q.shape[0] kv_c_and_k_pe_cache = kv_c_and_k_pe_cache.view( @@ -445,31 +944,6 @@ class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]): output = output[:, : self.num_heads, :] return output - def _forward_fp8_kv( - self, - q: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, - topk_indices: torch.Tensor, - attn_metadata: FlashMLASparseMetadata, - ) -> torch.Tensor: - assert attn_metadata.fp8_extra_metadata is not None - extra_metadata = attn_metadata.fp8_extra_metadata - - _attn_out, _ = flash_mla_with_kvcache( - q=q.unsqueeze(0), # unsqueeze to add batch_dim - k_cache=kv_c_and_k_pe_cache.view(torch.uint8).unsqueeze(-2), - block_table=extra_metadata.dummy_block_table, - head_dim_v=512, - cache_seqlens=extra_metadata.cache_lens, - tile_scheduler_metadata=extra_metadata.scheduler_metadata, - num_splits=extra_metadata.num_splits, - is_fp8_kvcache=True, - indices=topk_indices.unsqueeze(0), # unsqueeze to add batch_dim - softmax_scale=self.softmax_scale, - ) - - return _attn_out - def forward( self, layer: AttentionLayer, @@ -477,7 +951,7 @@ class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]): k_c_normed: torch.Tensor, # key in unified attn k_pe: torch.Tensor, # value in unified attn kv_cache: torch.Tensor, - attn_metadata: FlashMLASparseMetadata, + attn_metadata: FlashMLASparseMetadata | None, output: torch.Tensor | None = None, output_scale: torch.Tensor | None = None, output_block_scale: torch.Tensor | None = None, @@ -493,6 +967,7 @@ class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]): ) if attn_metadata is None: + # Dummy run - no need to allocate buffers # The zero fill is required when used with DP + EP # to ensure all ranks within a DP group compute the # same expert outputs. @@ -505,6 +980,7 @@ class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]): q = q[:num_actual_toks, ...] k_c_normed = k_c_normed[:num_actual_toks, ...] k_pe = k_pe[:num_actual_toks, ...] + topk_indices = self.topk_indices_buffer[:num_actual_toks] q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) # Convert from (B, N, P) to (N, B, P) @@ -514,16 +990,7 @@ class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]): # Convert from (N, B, L) to (B, N, L) ql_nope = ql_nope.transpose(0, 1) - topk_indices = self.topk_indices_buffer[:num_actual_toks] - - # TODO: handle index / kv_cache correctly - topk_indices_global = triton_convert_req_index_to_global_index( - attn_metadata.req_id_per_token, - attn_metadata.block_table, - topk_indices, - BLOCK_SIZE=attn_metadata.block_size, - NUM_TOPK_TOKENS=attn_metadata.topk_tokens, - ) + use_fp8_cache = self.kv_cache_dtype == "fp8_ds_mla" q = torch.cat([ql_nope, q_pe], dim=-1) @@ -538,13 +1005,15 @@ class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]): scale=layer._k_scale, ) - if self.kv_cache_dtype != "fp8_ds_mla": - attn_out = self._forward_bf16_kv( - q, kv_cache, topk_indices_global, attn_metadata + if not use_fp8_cache: + attn_out = self._forward_bf16_kv(q, kv_cache, topk_indices, attn_metadata) + elif attn_metadata.fp8_use_mixed_batch: + attn_out = self._forward_fp8_kv_mixed_batch( + q, kv_cache, topk_indices, attn_metadata ) else: - attn_out = self._forward_fp8_kv( - q, kv_cache, topk_indices_global, attn_metadata + attn_out = self._forward_fp8_kv_separate_prefill_decode( + q, kv_cache, topk_indices, attn_metadata ) self._v_up_proj(attn_out, out=output[:num_actual_toks]) diff --git a/vllm/v1/attention/backends/mla/indexer.py b/vllm/v1/attention/backends/mla/indexer.py index 77f1ba00d5b04..d0696f60a08c7 100644 --- a/vllm/v1/attention/backends/mla/indexer.py +++ b/vllm/v1/attention/backends/mla/indexer.py @@ -18,6 +18,7 @@ from vllm.v1.attention.backends.utils import ( AttentionMetadataBuilder, CommonAttentionMetadata, split_decodes_and_prefills, + split_prefill_chunks, ) logger = init_logger(__name__) @@ -176,40 +177,15 @@ def kv_spans_from_batches( def get_max_prefill_buffer_size(vllm_config: VllmConfig): max_model_len = vllm_config.model_config.max_model_len - # NOTE(Chen): 2 is a magic number for controlling the prefill buffer size. - # May be tuned later. - return max_model_len * 2 - - -def split_prefill_chunks( - seq_lens_cpu: torch.Tensor, max_prefill_buffer_size: int, reqs_start: int -) -> list[tuple[int, int]]: - """ - Split the prefill chunks into a list of tuples of (reqs_start, reqs_end) - such that the total sequence length of each chunk is less than the - maximum prefill buffer size. - - Args: - seq_lens_cpu: The sequence lengths of the prefill requests. - max_prefill_buffer_size: The maximum prefill buffer size. - reqs_start: The start index of the prefill requests. - - Returns: - A list of tuples of (reqs_start, reqs_end). - """ - chunk_seq_ids = [] - total_seq_lens = 0 - for i in range(reqs_start, len(seq_lens_cpu)): - cur_seq_len = seq_lens_cpu[i].item() - assert cur_seq_len <= max_prefill_buffer_size - total_seq_lens += cur_seq_len - if total_seq_lens > max_prefill_buffer_size: - chunk_seq_ids.append((reqs_start, i)) - reqs_start = i - total_seq_lens = cur_seq_len - if total_seq_lens > 0: - chunk_seq_ids.append((reqs_start, len(seq_lens_cpu))) - return chunk_seq_ids + # NOTE(Chen): 40 is a magic number for controlling the prefill buffer size. + # Each entry is 128 fp8 bytes and 4 scale bytes for a total of 132 bytes. + # The flashmla_sparse backend uses a workspace size of 5 * max_model_len. + # The memory usage of the workspace there is 576 * 2 bytes; so we size this as + # (576 * 2 // 132) * 5 = 40 to maximize this workspace size while still fitting + # within the flashmla_sparse workspace. + # For DeepSeek-V3.2, the max_model_len is 163840. + # 40 * 163840 * 132 = 865075200 bytes = 825 MB + return max_model_len * 40 class DeepseekV32IndexerMetadataBuilder(AttentionMetadataBuilder): @@ -302,9 +278,9 @@ class DeepseekV32IndexerMetadataBuilder(AttentionMetadataBuilder): prefill_metadata = None if num_prefills > 0: chunk_seq_ids = split_prefill_chunks( - common_attn_metadata.seq_lens_cpu, + common_attn_metadata.seq_lens_cpu[num_decodes:], self.max_prefill_buffer_size, - num_decodes, + request_offset=num_decodes, ) chunks = [ self.build_one_prefill_chunk( diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 79a1f7d4757d9..da43d87038234 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -937,6 +937,33 @@ def split_decodes_and_prefills( return (num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens) +def split_prefill_chunks( + seq_lens_cpu: torch.Tensor, workspace_size: int, request_offset: int = 0 +) -> list[tuple[int, int]]: + """ + Split the prefill requests into chunks such that the total sequence length + of each chunk is less than or equal to the workspace size. + + Args: + seq_lens_cpu: The sequence lengths of the prefill requests on CPU. + workspace_size: The maximum workspace size (in tokens) per chunk. + request_offset: The offset to add to the request indices. + Returns: + A list of tuples of (reqs_start, reqs_end) representing chunk boundaries. + """ + chunk_bounds = [] + i, n = 0, len(seq_lens_cpu) + assert torch.all(seq_lens_cpu <= workspace_size).item() + + while i < n: + start, chunk_total = i, 0 + while i < n and (chunk_total + (s := seq_lens_cpu[i].item())) <= workspace_size: + chunk_total += s + i += 1 + chunk_bounds.append((start + request_offset, i + request_offset)) + return chunk_bounds + + def reorder_batch_to_split_decodes_and_prefills( input_batch: "InputBatch", scheduler_output: "SchedulerOutput", diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3f20296c27ba7..978224faae65e 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -162,6 +162,7 @@ from vllm.v1.worker.ubatch_utils import ( maybe_create_ubatch_slices, ) from vllm.v1.worker.utils import is_residual_scattered_for_sp +from vllm.v1.worker.workspace import lock_workspace from .utils import ( AttentionGroup, @@ -297,6 +298,7 @@ class GPUModelRunner( self.device = device self.pin_memory = is_pin_memory_available() self.dtype = self.model_config.dtype + self.kv_cache_dtype = kv_cache_dtype_str_to_dtype( cache_config.cache_dtype, self.model_config ) @@ -4597,6 +4599,10 @@ class GPUModelRunner( # after here. set_cudagraph_capturing_enabled(False) + # Lock workspace to prevent resizing during execution. + # Max workspace sizes should have been captured during warmup/profiling. + lock_workspace() + end_time = time.perf_counter() elapsed_time = end_time - start_time cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 25ac5aaf99818..21a8564f83c40 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -54,6 +54,7 @@ from vllm.v1.outputs import ( from vllm.v1.utils import report_usage_stats from vllm.v1.worker.utils import is_residual_scattered_for_sp from vllm.v1.worker.worker_base import WorkerBase +from vllm.v1.worker.workspace import init_workspace_manager logger = init_logger(__name__) @@ -255,6 +256,10 @@ class Worker(WorkerBase): else: raise RuntimeError(f"Not support device type: {self.device_config.device}") + # Initialize workspace manager + num_ubatches = 2 if self.vllm_config.parallel_config.enable_dbo else 1 + init_workspace_manager(self.device, num_ubatches) + # Construct the model runner if self.use_v2_model_runner: from vllm.v1.worker.gpu.model_runner import ( diff --git a/vllm/v1/worker/workspace.py b/vllm/v1/worker/workspace.py new file mode 100644 index 0000000000000..a16dde1f67800 --- /dev/null +++ b/vllm/v1/worker/workspace.py @@ -0,0 +1,245 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import inspect +import os +from itertools import accumulate +from math import prod +from typing import Optional + +import torch + +import vllm.envs as envs +from vllm.logger import init_logger +from vllm.utils.math_utils import round_up +from vllm.v1.worker.ubatching import dbo_current_ubatch_id + +logger = init_logger(__name__) + + +def _compute_bytes(shape: tuple[int, ...], dtype: torch.dtype) -> int: + return prod(shape) * dtype.itemsize + + +# Constants +_MB = 1024**2 +_GiB = 1024**3 + +# Global workspace manager instance +_manager: Optional["WorkspaceManager"] = None + + +class WorkspaceManager: + """Manager for workspace allocation. + + Manages workspace buffers for DBO (Dual Batch Overlap) execution. + Can be locked to prevent further growth during execution. + """ + + def __init__(self, device: torch.device, num_ubatches: int | None = None): + self._device = device + # Cache num ubatches at init based on configuration (default to 1) + self._num_ubatches = num_ubatches if num_ubatches is not None else 1 + self._current_workspaces: list[torch.Tensor | None] = [None, None] + self._locked: bool = False + + @staticmethod + def _workspace_size_bytes(workspace: torch.Tensor | None) -> int: + """Get size of workspace in bytes.""" + if workspace is None: + return 0 + return workspace.numel() * workspace.element_size() + + def lock(self) -> None: + """Lock the workspace to prevent further growth. + + After locking, any attempt to allocate a larger workspace will raise + an assertion error. This ensures workspace size is fixed during execution. + """ + self._locked = True + if envs.VLLM_DEBUG_WORKSPACE: + logger.info( + "[WORKSPACE DEBUG] Workspace locked. Current sizes: %s", + [ + self._workspace_size_bytes(ws) / _MB + for ws in self._current_workspaces + if ws is not None + ], + ) + + def is_locked(self) -> bool: + """Check if workspace is locked.""" + return self._locked + + def get_simultaneous( + self, *shapes_and_dtypes: tuple[tuple[int, ...], torch.dtype] + ) -> list[torch.Tensor]: + """Get multiple workspace tensors simultaneously from a single allocation. + + Args: + *shapes_and_dtypes: One or more (shape, dtype) tuples. + + Returns: + List of tensor views into the workspace buffer, one per shape/dtype pair. + """ + actual_bytes = [_compute_bytes(s, d) for s, d in shapes_and_dtypes] + aligned_bytes = [round_up(actual, 256) for actual in actual_bytes] + total_bytes = sum(aligned_bytes) + + # Calculate cumulative offsets using itertools.accumulate + offsets = list(accumulate([0] + aligned_bytes[:-1])) + + current_workspace = self._ensure_workspace_size(total_bytes) + + return [ + current_workspace[offsets[i] : offsets[i] + actual_bytes[i]] + .view(shapes_and_dtypes[i][1]) + .reshape(shapes_and_dtypes[i][0]) + for i in range(len(shapes_and_dtypes)) + ] + + def _ensure_workspace_size(self, required_bytes: int) -> torch.Tensor: + """Ensure workspace is allocated and large enough, return current workspace. + + Args: + required_bytes: The number of bytes required. + + Returns: + The current workspace tensor. + """ + ubatch_id = dbo_current_ubatch_id() + current_workspace = self._current_workspaces[ubatch_id] + current_size = self._workspace_size_bytes(current_workspace) + + if current_size < required_bytes: + + def get_caller_info() -> str: + """Find first frame outside WorkspaceManager.""" + curr_frame = inspect.currentframe() + if curr_frame is None: + return "unknown" + # Walk up the stack skipping WorkspaceManager frames + curr_frame = curr_frame.f_back + while curr_frame is not None: + # TODO: This only catches instance methods (self), missing + # classmethods and staticmethods. Once Python 3.11+ is the + # minimum supported version, use co_qualname instead: + # qualname = curr_frame.f_code.co_qualname + # if qualname.startswith("WorkspaceManager."): + if isinstance(curr_frame.f_locals.get("self"), WorkspaceManager): + curr_frame = curr_frame.f_back + continue + filename = os.path.basename(curr_frame.f_code.co_filename) + return ( + f"{filename}:{curr_frame.f_lineno}:{curr_frame.f_code.co_name}" + ) + return "unknown" + + if self._locked: + raise AssertionError( + f"Workspace is locked but allocation from '{get_caller_info()}' " + f"requires {required_bytes / _MB:.2f} MB, current size is " + f"{current_size / _MB:.2f} MB. " + "Workspace growth is not allowed after locking." + ) + + for ubatch_id in range(self._num_ubatches): + current_workspace = self._current_workspaces[ubatch_id] + if current_workspace is None: + self._current_workspaces[ubatch_id] = torch.empty( + (required_bytes,), dtype=torch.uint8, device=self._device + ) + elif self._workspace_size_bytes(current_workspace) < required_bytes: + current_workspace.resize_(required_bytes) + + if envs.VLLM_DEBUG_WORKSPACE: + logger.info( + "[WORKSPACE DEBUG] Resized workspace from '%s': %.2f MB -> " + "%.2f MB (%d ubatches, total memory %.2f MB)", + get_caller_info(), + current_size / _MB, + required_bytes / _MB, + self._num_ubatches, + required_bytes * self._num_ubatches / _MB, + ) + + current_workspace = self._current_workspaces[dbo_current_ubatch_id()] + + return current_workspace + + +def is_workspace_manager_initialized() -> bool: + """Check if workspace manager has been initialized. + + Returns: + True if workspace manager is initialized, False otherwise. + """ + return _manager is not None + + +def current_workspace_manager() -> "WorkspaceManager": + """Get the current workspace manager instance. + + Raises: + AssertionError: If workspace manager has not been initialized. + """ + assert _manager is not None, ( + "WorkspaceManager not initialized. Call init_workspace_manager() " + "with a device before using workspace functions." + ) + return _manager + + +def init_workspace_manager( + device: torch.device, num_ubatches: int | None = None +) -> None: + """Initialize the workspace manager with a device. + + Must be called before using any workspace functions. Typically called + from GPUModelRunner.__init__. + + Args: + device: The device to allocate workspace on. + num_ubatches: Number of micro-batches. Defaults to 1. + """ + global _manager + if _manager is not None: + logger.warning( + "WorkspaceManager already initialized on device %s, " + "reinitializing on device %s", + _manager._device, + device, + ) + _manager = WorkspaceManager(device, num_ubatches) + + +def lock_workspace() -> None: + """Lock the workspace to prevent further growth. + + After calling this function, any attempt to allocate a workspace larger + than the current size will raise an AssertionError. This ensures that + workspace size is fixed during execution and prevents unexpected memory + allocations in the hot path. + + Example: + # During initialization + init_workspace_manager(device) + reserve_workspace(shape1, dtype1) + reserve_workspace(shape2, dtype2) + + # Lock after warmup/profiling + lock_workspace() + + # Now all get_workspace calls must fit in pre-allocated size + """ + current_workspace_manager().lock() + + +def reset_workspace_manager() -> None: + """Reset the workspace manager to uninitialized state. + + This is primarily intended for testing purposes to allow tests + to reinitialize the workspace manager cleanly. + """ + global _manager + _manager = None From 3e34adcdfb1e3845e2fc4d0cd192d314eab516f0 Mon Sep 17 00:00:00 2001 From: Vladislav Nosivskoy <47858711+vladnosiv@users.noreply.github.com> Date: Fri, 12 Dec 2025 18:01:06 +0300 Subject: [PATCH 29/57] [DeepSeek V3.2] Proper drop_thinking logic (#30490) Signed-off-by: Vladislav Nosivskoy --- vllm/tokenizers/deepseekv32.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py index 5c4936b5e7ad3..a7fa0f421725a 100644 --- a/vllm/tokenizers/deepseekv32.py +++ b/vllm/tokenizers/deepseekv32.py @@ -47,11 +47,13 @@ class DeepseekV32Tokenizer(HfTokenizer): thinking_mode = "chat" conversation = kwargs.get("conversation", messages) messages = conversation.copy() - drop_thinking = True if tools is not None and len(tools) > 0: messages.insert(0, {"role": "system"}) messages[0]["tools"] = tools - drop_thinking = False + + # Historical reasoning content is dropped when a new user message is introduced + drop_thinking = messages[-1]["role"] == "user" + encode_config = dict(thinking_mode=thinking_mode, drop_thinking=drop_thinking) prompt_str = encode_messages(messages, **encode_config) # type: ignore return prompt_str From dc13c99eedf837f22f60e9b5836abf147f5254f1 Mon Sep 17 00:00:00 2001 From: Christina Norman Date: Fri, 12 Dec 2025 09:10:12 -0600 Subject: [PATCH 30/57] fix(gguf): Disable bfloat16 for GGUF on blackwell device (#30408) Signed-off-by: Christina Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: Christina Norman Co-authored-by: Isotr0py Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/layers/quantization/gguf.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 13aa2bcad21ba..9dd734f2fea6a 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -33,6 +33,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ) from vllm.model_executor.models.utils import WeightsMapper from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform from vllm.utils.torch_utils import direct_register_custom_op logger = init_logger(__name__) @@ -52,6 +53,11 @@ class GGUFConfig(QuantizationConfig): return "gguf" def get_supported_act_dtypes(self) -> list[torch.dtype]: + # GGUF dequantization kernels use half precision (fp16) internally. + # bfloat16 has precision issues on Blackwell devices. + if current_platform.has_device_capability(100): + logger.warning_once("GGUF has precision issues with bfloat16 on Blackwell.") + return [torch.half, torch.float32] return [torch.half, torch.bfloat16, torch.float32] @classmethod From 09ad3b76b320fffcb6b0214bd90851c3328581ea Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 12 Dec 2025 10:40:50 -0500 Subject: [PATCH 31/57] [Bug] Fix attention_backend arg string parsing (#30534) Signed-off-by: mgoin --- vllm/engine/arg_utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 757023e12d439..2867532756450 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1649,7 +1649,13 @@ class EngineArgs: "attention_backend and attention_config.backend " "are mutually exclusive" ) - attention_config.backend = self.attention_backend + # Convert string to enum if needed (CLI parsing returns a string) + if isinstance(self.attention_backend, str): + attention_config.backend = AttentionBackendEnum[ + self.attention_backend.upper() + ] + else: + attention_config.backend = self.attention_backend load_config = self.create_load_config() From 9c0ee995a81fbd87b397c956ca56fc94f784966e Mon Sep 17 00:00:00 2001 From: jvlunteren <161835099+jvlunteren@users.noreply.github.com> Date: Fri, 12 Dec 2025 16:55:40 +0100 Subject: [PATCH 32/57] [Kernel] Support CUDA Graphs in 3D Triton Attention Kernel (#28306) Signed-off-by: Jan van Lunteren Signed-off-by: jvlunteren <161835099+jvlunteren@users.noreply.github.com> Co-authored-by: Thomas Parnell Co-authored-by: Thomas Parnell --- .../test_triton_unified_attention.py | 27 ++++++ .../attention/ops/triton_unified_attention.py | 69 +++++++-------- vllm/v1/attention/backends/triton_attn.py | 84 ++++++++++++++++++- 3 files changed, 140 insertions(+), 40 deletions(-) diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py index bf4d2179af5f9..7fb08e5780f51 100644 --- a/tests/kernels/attention/test_triton_unified_attention.py +++ b/tests/kernels/attention/test_triton_unified_attention.py @@ -7,6 +7,7 @@ import torch from vllm.attention.ops.triton_unified_attention import unified_attention from vllm.platforms import current_platform +from vllm.utils.math_utils import next_power_of_2 NUM_HEADS = [(4, 4), (8, 2)] HEAD_SIZES = [128, 256] @@ -22,6 +23,10 @@ QDTYPES = ( # one value small enough to test the schema op check NUM_BLOCKS = [32768, 2048] +# 0: use 2D kernel for decode +# 8: use 3D kernel for decode +SEQ_THRESHOLD_3D_VALUES = [0, 8] + def ref_paged_attn( query: torch.Tensor, @@ -92,6 +97,7 @@ def ref_paged_attn( @pytest.mark.parametrize("soft_cap", [None, 50.0]) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("q_dtype", QDTYPES) +@pytest.mark.parametrize("seq_threshold_3D", SEQ_THRESHOLD_3D_VALUES) @torch.inference_mode() def test_triton_unified_attn( seq_lens: list[tuple[int, int]], @@ -103,6 +109,7 @@ def test_triton_unified_attn( soft_cap: float | None, num_blocks: int, q_dtype: torch.dtype | None, + seq_threshold_3D: int, ) -> None: torch.set_default_device("cuda") @@ -152,6 +159,21 @@ def test_triton_unified_attn( k_descale = torch.rand(scale_shape, dtype=torch.float32) v_descale = torch.rand(scale_shape, dtype=torch.float32) + num_par_softmax_segments = 16 + head_size_padded = next_power_of_2(head_size) + softmax_segm_output = torch.empty( + (seq_threshold_3D, num_query_heads, num_par_softmax_segments, head_size_padded), + dtype=torch.float32, + ) + softmax_segm_max = torch.empty( + (seq_threshold_3D, num_query_heads, num_par_softmax_segments), + dtype=torch.float32, + ) + softmax_segm_expsum = torch.empty( + (seq_threshold_3D, num_query_heads, num_par_softmax_segments), + dtype=torch.float32, + ) + unified_attention( q=maybe_quantized_query, k=maybe_quantized_key_cache, @@ -169,6 +191,11 @@ def test_triton_unified_attn( q_descale=q_descale, k_descale=k_descale, v_descale=v_descale, + seq_threshold_3D=seq_threshold_3D, + num_par_softmax_segments=num_par_softmax_segments, + softmax_segm_output=softmax_segm_output, + softmax_segm_max=softmax_segm_max, + softmax_segm_expsum=softmax_segm_expsum, ) ref_output = ref_paged_attn( diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index 565be1c39bec1..a1877bb4429b9 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -355,7 +355,7 @@ def kernel_unified_attention_2d( @triton.jit def kernel_unified_attention_3d( segm_output_ptr, - # [num_tokens, num_query_heads, num_segments, head_size] + # [num_tokens, num_query_heads, num_segments, head_size_padded] segm_max_ptr, # [num_tokens, num_query_heads, num_segments] segm_expsum_ptr, # [num_tokens, num_query_heads, num_segments] query_ptr, # [num_tokens, num_query_heads, head_size] @@ -749,6 +749,11 @@ def unified_attention( q_descale, k_descale, v_descale, + seq_threshold_3D=None, + num_par_softmax_segments=None, + softmax_segm_output=None, + softmax_segm_max=None, + softmax_segm_expsum=None, alibi_slopes=None, output_scale=None, qq_bias=None, @@ -793,8 +798,19 @@ def unified_attention( TILE_SIZE_PREFILL = 32 TILE_SIZE_DECODE = 16 if q.element_size() >= 2 else 32 - # if batch contains a prefill - if max_seqlen_q > 1 or total_num_q_blocks * num_kv_heads > 128: + # Launch the 2D kernel if + # 1. No intermediate tiled softmax buffers for the 3D kernel have been allocated, or + # 2. The batch includes at least one prefill request, or + # 3. The number of sequences exceeds the configured threshold + if ( + seq_threshold_3D is None + or num_par_softmax_segments is None + or softmax_segm_output is None + or softmax_segm_max is None + or softmax_segm_expsum is None + or max_seqlen_q > 1 + or num_seqs > seq_threshold_3D + ): kernel_unified_attention_2d[ ( total_num_q_blocks, @@ -847,37 +863,12 @@ def unified_attention( USE_FP8=output_scale is not None, ) else: - # for initial version, NUM_SEGMENTS = 16 is chosen as a default - # value that showed good performance in tests - NUM_SEGMENTS = 16 - - segm_output = torch.empty( - q.shape[0], - num_query_heads, - NUM_SEGMENTS, - triton.next_power_of_2(head_size), - dtype=torch.float32, - device=q.device, - ) - segm_max = torch.empty( - q.shape[0], - num_query_heads, - NUM_SEGMENTS, - dtype=torch.float32, - device=q.device, - ) - segm_expsum = torch.empty( - q.shape[0], - num_query_heads, - NUM_SEGMENTS, - dtype=torch.float32, - device=q.device, - ) - - kernel_unified_attention_3d[(total_num_q_blocks, num_kv_heads, NUM_SEGMENTS)]( - segm_output_ptr=segm_output, - segm_max_ptr=segm_max, - segm_expsum_ptr=segm_expsum, + kernel_unified_attention_3d[ + (total_num_q_blocks, num_kv_heads, num_par_softmax_segments) + ]( + segm_output_ptr=softmax_segm_output, + segm_max_ptr=softmax_segm_max, + segm_expsum_ptr=softmax_segm_expsum, query_ptr=q, key_cache_ptr=k, value_cache_ptr=v, @@ -917,13 +908,13 @@ def unified_attention( BLOCK_Q=BLOCK_Q, num_seqs=num_seqs, BLOCK_M=BLOCK_M, - NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, + NUM_SEGMENTS_PER_SEQ=num_par_softmax_segments, ) reduce_segments[(q.shape[0], num_query_heads)]( output_ptr=out, - segm_output_ptr=segm_output, - segm_max_ptr=segm_max, - segm_expsum_ptr=segm_expsum, + segm_output_ptr=softmax_segm_output, + segm_max_ptr=softmax_segm_max, + segm_expsum_ptr=softmax_segm_expsum, seq_lens_ptr=seqused_k, num_seqs=num_seqs, num_query_heads=num_query_heads, @@ -936,6 +927,6 @@ def unified_attention( HEAD_SIZE_PADDED=triton.next_power_of_2(head_size), query_start_len_ptr=cu_seqlens_q, BLOCK_Q=BLOCK_Q, - NUM_SEGMENTS_PER_SEQ=NUM_SEGMENTS, + NUM_SEGMENTS_PER_SEQ=num_par_softmax_segments, USE_FP8=output_scale is not None, ) diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 3b17c4bcd89cc..7bea3862a03f9 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -17,7 +17,7 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import ( triton_reshape_and_cache_flash, ) from vllm.attention.ops.triton_unified_attention import unified_attention -from vllm.config import VllmConfig +from vllm.config import CUDAGraphMode, VllmConfig from vllm.config.cache import CacheDType from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.quant_utils import ( @@ -26,6 +26,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( ) from vllm.platforms import current_platform from vllm.platforms.interface import DeviceCapability +from vllm.utils.math_utils import next_power_of_2 from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, @@ -36,6 +37,11 @@ from vllm.v1.kv_cache_interface import AttentionSpec logger = init_logger(__name__) +# constants +MIN_LAUNCH_GRID_SIZE_2D = 128 # Minimum launch grid size of 2D kernel +NUM_PAR_SOFTMAX_SEGMENTS = 16 # Number of parallel tiled softmax segments + + @dataclass class TritonAttentionMetadata: # NOTE(sang): Definition of context_len, query_len, and seq_len. @@ -54,6 +60,12 @@ class TritonAttentionMetadata: block_table: torch.Tensor slot_mapping: torch.Tensor + seq_threshold_3D: int + num_par_softmax_segments: int + softmax_segm_output: torch.Tensor + softmax_segm_max: torch.Tensor + softmax_segm_expsum: torch.Tensor + # For cascade attention. use_cascade: bool common_prefix_len: int @@ -87,6 +99,60 @@ class TritonAttentionMetadataBuilder(AttentionMetadataBuilder[TritonAttentionMet self.num_heads_kv = model_config.get_num_kv_heads(vllm_config.parallel_config) self.headdim = model_config.get_head_size() + # Check if CUDA Graphs are enabled for decode + self.decode_cudagraph_enabled = ( + self.vllm_config.compilation_config.cudagraph_mode + in ( + CUDAGraphMode.FULL_AND_PIECEWISE, + CUDAGraphMode.FULL_DECODE_ONLY, + CUDAGraphMode.FULL, + ) + ) + + # The launch grid for the 2D kernel is defined as (num_q_blocks, num_heads_kv). + # A lower bound for num_q_blocks is the number of sequences. + # To ensure the minimum launch grid size is achieved, the number of sequences + # must be at least equal to the threshold below. + # If this threshold is not reached (i.e., the batch size is not large enough), + # the 3D kernel will be selected instead. + self.seq_threshold_3D = MIN_LAUNCH_GRID_SIZE_2D // self.num_heads_kv + + # Modify the threshold if needed. + if self.decode_cudagraph_enabled: + capture_sizes = self.vllm_config.compilation_config.cudagraph_capture_sizes + assert capture_sizes, "CUDA Graphs enabled but no capture sizes specified." + + # Select the CUDA Graph capture size closest to self.seq_threshold_3D + # as threshold. This ensures that each captured graph covers the + # correct execution path. + self.seq_threshold_3D = min( + capture_sizes, + key=lambda x: abs(x - self.seq_threshold_3D), + ) + + self.num_par_softmax_segments = NUM_PAR_SOFTMAX_SEGMENTS + headdim_padded = next_power_of_2(self.headdim) + self.softmax_segm_output = torch.empty( + ( + self.seq_threshold_3D, + self.num_heads_q, + self.num_par_softmax_segments, + headdim_padded, + ), + dtype=torch.float32, + device=device, + ) + self.softmax_segm_max = torch.empty( + (self.seq_threshold_3D, self.num_heads_q, self.num_par_softmax_segments), + dtype=torch.float32, + device=device, + ) + self.softmax_segm_expsum = torch.empty( + (self.seq_threshold_3D, self.num_heads_q, self.num_par_softmax_segments), + dtype=torch.float32, + device=device, + ) + def build_for_cudagraph_capture( self, common_attn_metadata: CommonAttentionMetadata ) -> TritonAttentionMetadata: @@ -143,6 +209,11 @@ class TritonAttentionMetadataBuilder(AttentionMetadataBuilder[TritonAttentionMet prefix_kv_lens=prefix_kv_lens, suffix_kv_lens=suffix_kv_lens, prefix_scheduler_metadata=prefix_scheduler_metadata, + seq_threshold_3D=self.seq_threshold_3D, + num_par_softmax_segments=self.num_par_softmax_segments, + softmax_segm_output=self.softmax_segm_output, + softmax_segm_max=self.softmax_segm_max, + softmax_segm_expsum=self.softmax_segm_expsum, ) return attn_metadata @@ -349,6 +420,12 @@ class TritonAttentionImpl(AttentionImpl): max_seqlen_k = attn_metadata.max_seq_len block_table = attn_metadata.block_table + seq_threshold_3D = attn_metadata.seq_threshold_3D + num_par_softmax_segments = attn_metadata.num_par_softmax_segments + softmax_segm_output = attn_metadata.softmax_segm_output + softmax_segm_max = attn_metadata.softmax_segm_max + softmax_segm_expsum = attn_metadata.softmax_segm_expsum + descale_shape = (cu_seqlens_q.shape[0] - 1, key_cache.shape[2]) unified_attention( @@ -369,6 +446,11 @@ class TritonAttentionImpl(AttentionImpl): q_descale=None, # Not supported k_descale=layer._k_scale.expand(descale_shape), v_descale=layer._v_scale.expand(descale_shape), + seq_threshold_3D=seq_threshold_3D, + num_par_softmax_segments=num_par_softmax_segments, + softmax_segm_output=softmax_segm_output, + softmax_segm_max=softmax_segm_max, + softmax_segm_expsum=softmax_segm_expsum, sinks=self.sinks, output_scale=output_scale, ) From f3237f3f6b1ce3ea3b1881a059811c2695ffe650 Mon Sep 17 00:00:00 2001 From: Benjamin Bartels Date: Fri, 12 Dec 2025 16:28:54 +0000 Subject: [PATCH 33/57] [Frontend] Fixes anthropic streaming message_start usage nesting (#30266) Signed-off-by: bbartels --- tests/entrypoints/openai/test_messages.py | 9 ++++++--- vllm/entrypoints/anthropic/serving_messages.py | 12 ++++++------ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/tests/entrypoints/openai/test_messages.py b/tests/entrypoints/openai/test_messages.py index b804a1a7a841a..8de6c4cb6c887 100644 --- a/tests/entrypoints/openai/test_messages.py +++ b/tests/entrypoints/openai/test_messages.py @@ -79,9 +79,12 @@ async def test_anthropic_streaming(client: anthropic.AsyncAnthropic): assert chunk_count > 0 assert first_chunk is not None, "message_start chunk was never observed" - assert first_chunk.usage is not None, "first chunk should include usage stats" - assert first_chunk.usage["output_tokens"] == 0 - assert first_chunk.usage["input_tokens"] > 5 + assert first_chunk.message is not None, "first chunk should include message" + assert first_chunk.message.usage is not None, ( + "first chunk should include usage stats" + ) + assert first_chunk.message.usage.output_tokens == 0 + assert first_chunk.message.usage.input_tokens > 5 @pytest.mark.asyncio diff --git a/vllm/entrypoints/anthropic/serving_messages.py b/vllm/entrypoints/anthropic/serving_messages.py index e7ea3bb59ca70..25c2d88a2c7a4 100644 --- a/vllm/entrypoints/anthropic/serving_messages.py +++ b/vllm/entrypoints/anthropic/serving_messages.py @@ -324,12 +324,12 @@ class AnthropicServingMessages(OpenAIServingChat): id=origin_chunk.id, content=[], model=origin_chunk.model, - ), - usage=AnthropicUsage( - input_tokens=origin_chunk.usage.prompt_tokens - if origin_chunk.usage - else 0, - output_tokens=0, + usage=AnthropicUsage( + input_tokens=origin_chunk.usage.prompt_tokens + if origin_chunk.usage + else 0, + output_tokens=0, + ), ), ) first_item = False From d2c919dcc20b1ea77a94fa01e813ebbb31f8a66a Mon Sep 17 00:00:00 2001 From: realliujiaxu Date: Sat, 13 Dec 2025 01:03:35 +0800 Subject: [PATCH 34/57] [bugfix] fix bug when top_logprobs=0 with spec decoding (#30059) Signed-off-by: realliujiaxu --- tests/v1/sample/test_logprobs.py | 4 +++- tests/v1/sample/test_rejection_sampler.py | 2 +- vllm/v1/sample/rejection_sampler.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index c89c33be80c10..76a0e8e25a4ae 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -528,9 +528,11 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode): ), ], ) +@pytest.mark.parametrize("top_logprobs", [0, 3]) def test_spec_decode_logprobs( logprobs_mode: LogprobsMode, model_setup: tuple[str, str, str], + top_logprobs: int, ): """Spec decode logprobs should match those of the base model. @@ -543,7 +545,7 @@ def test_spec_decode_logprobs( prompt = "Hello world " * 50 sampling_params = SamplingParams( - temperature=0, logprobs=3, max_tokens=10, ignore_eos=False + temperature=0, logprobs=top_logprobs, max_tokens=10, ignore_eos=False ) method, model_name, spec_model_name = model_setup max_model_len = 256 diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py index bf7726ebf907f..61caffee45daf 100644 --- a/tests/v1/sample/test_rejection_sampler.py +++ b/tests/v1/sample/test_rejection_sampler.py @@ -111,7 +111,7 @@ def create_sampling_metadata( top_p=top_p, top_k=top_k, generators=generators, - max_num_logprobs=0, + max_num_logprobs=None, no_penalties=no_penalties, prompt_token_ids=prompt_token_ids, frequency_penalties=frequency_penalties, diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index ccaf07e18c468..50b91d8292ee8 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -145,7 +145,7 @@ class RejectionSampler(nn.Module): ) logprobs_tensors = None - if sampling_metadata.max_num_logprobs: + if sampling_metadata.max_num_logprobs is not None: logprobs_tensors = self._get_logprobs_tensors( sampling_metadata.max_num_logprobs, metadata, From 02a58803948e6b493a9bde6d38b69423a638ae49 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 12 Dec 2025 13:05:34 -0500 Subject: [PATCH 35/57] [CI] Fix mypy for vllm/v1/executor (#30517) Signed-off-by: yewentao256 --- tools/pre_commit/mypy.py | 2 +- vllm/v1/executor/abstract.py | 2 +- vllm/v1/executor/multiproc_executor.py | 10 +++++++--- vllm/v1/executor/ray_executor.py | 6 +++--- vllm/v1/executor/uniproc_executor.py | 13 ++++++++----- 5 files changed, 20 insertions(+), 13 deletions(-) diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py index 724b393044266..3f7e0a069f869 100755 --- a/tools/pre_commit/mypy.py +++ b/tools/pre_commit/mypy.py @@ -43,6 +43,7 @@ FILES = [ "vllm/worker", "vllm/v1/core", "vllm/v1/engine", + "vllm/v1/executor", "vllm/v1/metrics", "vllm/v1/pool", "vllm/v1/sample", @@ -60,7 +61,6 @@ SEPARATE_GROUPS = [ "vllm/model_executor", # v1 related "vllm/v1/attention", - "vllm/v1/executor", "vllm/v1/kv_offload", "vllm/v1/spec_decode", "vllm/v1/structured_output", diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index db8303fcec501..8ada52435edae 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -219,7 +219,7 @@ class Executor(ABC): def sample_tokens( self, grammar_output: GrammarOutput | None, non_block: bool = False - ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]: + ) -> ModelRunnerOutput | Future[ModelRunnerOutput]: output = self.collective_rpc( # type: ignore[call-overload] "sample_tokens", args=(grammar_output,), non_block=non_block ) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 7e8ebe25c4603..b42d026a3e15b 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -294,8 +294,8 @@ class MultiprocExecutor(Executor): kwargs: dict | None = None, non_block: bool = False, unique_reply_rank: int | None = None, - kv_output_aggregator: KVOutputAggregator = None, - ) -> Any | list[Any] | Future[Any | list[Any]]: + kv_output_aggregator: KVOutputAggregator | None = None, + ) -> Any: """Returns single result if unique_reply_rank and/or kv_output_aggregator is provided, otherwise list.""" assert self.rpc_broadcast_mq is not None, ( @@ -476,6 +476,8 @@ class WorkerProc: """Wrapper that runs one Worker in a separate process.""" READY_STR = "READY" + rpc_broadcast_mq: MessageQueue | None + worker_response_mq: MessageQueue | None def _init_message_queues( self, input_shm_handle: Handle, vllm_config: VllmConfig @@ -487,7 +489,7 @@ class WorkerProc: ) # Initializes a message queue for sending the model output - self.worker_response_mq: MessageQueue = MessageQueue(1, 1) + self.worker_response_mq = MessageQueue(1, 1) self.peer_response_handles = [] else: # Initialize remote MessageQueue for receiving SchedulerOutput across nodes @@ -720,6 +722,7 @@ class WorkerProc: try: reader.close() worker = WorkerProc(*args, **kwargs) + assert worker.worker_response_mq is not None # Send READY once we know everything is loaded ready_writer.send( @@ -804,6 +807,7 @@ class WorkerProc: def worker_busy_loop(self, cancel: threading.Event | None = None): """Main busy loop for Multiprocessing Workers""" + assert self.rpc_broadcast_mq is not None while True: method, args, kwargs, output_rank = self.rpc_broadcast_mq.dequeue( cancel=cancel, indefinite=True diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py index 406eafcd339b0..2fd64e5c2277c 100644 --- a/vllm/v1/executor/ray_executor.py +++ b/vllm/v1/executor/ray_executor.py @@ -413,7 +413,7 @@ class RayDistributedExecutor(Executor): self, grammar_output: "GrammarOutput | None", non_block: bool = False, - ) -> ModelRunnerOutput | Future[ModelRunnerOutput]: + ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]: """Execute the model on the Ray workers. The scheduler output to use should have been provided in @@ -428,7 +428,7 @@ class RayDistributedExecutor(Executor): """ scheduler_output = self.scheduler_output if scheduler_output is None: - return COMPLETED_NONE_FUTURE if non_block else None # noqa + return COMPLETED_NONE_FUTURE if non_block else None self.scheduler_output = None @@ -439,7 +439,7 @@ class RayDistributedExecutor(Executor): scheduler_output: SchedulerOutput, grammar_output: "GrammarOutput | None", non_block: bool = False, - ) -> ModelRunnerOutput | Future[ModelRunnerOutput]: + ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]: # Build the compiled DAG for the first time. if self.forward_dag is None: # type: ignore self.forward_dag = self._compiled_ray_dag(enable_asyncio=False) diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py index 095d3d1dac21b..b8ca922554304 100644 --- a/vllm/v1/executor/uniproc_executor.py +++ b/vllm/v1/executor/uniproc_executor.py @@ -67,7 +67,7 @@ class UniProcExecutor(Executor): kwargs: dict | None = None, non_block: bool = False, single_value: bool = False, - ) -> Any | list[Any] | Future[Any | list[Any]]: + ) -> Any: if kwargs is None: kwargs = {} @@ -79,10 +79,13 @@ class UniProcExecutor(Executor): result = run_method(self.driver_worker, method, args, kwargs) if isinstance(result, AsyncModelRunnerOutput): if (async_thread := self.async_output_thread) is not None: - get_output = result.get_output - if not single_value: - get_output = lambda go=result.get_output: [go()] - return async_thread.submit(get_output) + if single_value: + return async_thread.submit(result.get_output) + + def get_output_list() -> list[Any]: + return [result.get_output()] + + return async_thread.submit(get_output_list) result = result.get_output() future = Future[Any]() future.set_result(result if single_value else [result]) From cd7740ac5c3906b2913d58ade61f231ec3a93296 Mon Sep 17 00:00:00 2001 From: shivampr Date: Fri, 12 Dec 2025 10:28:20 -0800 Subject: [PATCH 36/57] [ROCm] Enable Triton ScaledMM fallback + kernel selection fix (#26668) Signed-off-by: Shivam Signed-off-by: Shivam --- .buildkite/test-pipeline.yaml | 2 +- .../test_scaled_mm_kernel_selection.py | 91 +++++++++++++++++++ .../kernels/scaled_mm/ScaledMMLinearKernel.py | 5 +- .../kernels/scaled_mm/__init__.py | 40 +++----- .../quantization/kernels/scaled_mm/aiter.py | 22 +++-- .../quantization/kernels/scaled_mm/cpu.py | 11 ++- .../quantization/kernels/scaled_mm/cutlass.py | 17 +++- .../quantization/kernels/scaled_mm/triton.py | 63 +++++++++---- .../quantization/kernels/scaled_mm/xla.py | 11 ++- 9 files changed, 193 insertions(+), 69 deletions(-) create mode 100644 tests/kernels/quantization/test_scaled_mm_kernel_selection.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 750e7c038351c..0a5b56f473c29 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -836,7 +836,7 @@ steps: - tests/models/multimodal no_gpu: true commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - "pip install git+https://github.com/TIGER-AI-Lab/Mantis.git || echo 'Mantis installation skipped (decord not available on CPU-only environment)'" - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py - label: Multi-Modal Processor Test diff --git a/tests/kernels/quantization/test_scaled_mm_kernel_selection.py b/tests/kernels/quantization/test_scaled_mm_kernel_selection.py new file mode 100644 index 0000000000000..2ed55931c8164 --- /dev/null +++ b/tests/kernels/quantization/test_scaled_mm_kernel_selection.py @@ -0,0 +1,91 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for ScaledMM kernel selection logic (CPU-only) + +Run `pytest tests/kernels/quantization/test_scaled_mm_kernel_selection.py`. +""" + +import inspect +from abc import ABC + +import pytest + +from vllm.model_executor.layers.quantization.kernels.scaled_mm import ( + ScaledMMLinearLayerConfig, +) +from vllm.model_executor.layers.quantization.kernels.scaled_mm.aiter import ( + AiterScaledMMLinearKernel, +) +from vllm.model_executor.layers.quantization.kernels.scaled_mm.cpu import ( + CPUScaledMMLinearKernel, +) +from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501 + ScaledMMLinearKernel, +) + +pytestmark = pytest.mark.cpu_test + + +def test_is_supported_is_abstract(): + """Test that is_supported() is properly defined as abstract.""" + assert issubclass(ScaledMMLinearKernel, ABC) + assert hasattr(ScaledMMLinearKernel, "is_supported") + + +def test_cpu_kernel_implements_is_supported(): + """Test that CPUScaledMMLinearKernel implements is_supported() method.""" + assert hasattr(CPUScaledMMLinearKernel, "is_supported"), ( + "CPUScaledMMLinearKernel missing is_supported() method" + ) + # Verify it's a classmethod by checking if it can be called with the class + # and by checking the method type + assert inspect.ismethod(CPUScaledMMLinearKernel.is_supported) or inspect.isfunction( + CPUScaledMMLinearKernel.is_supported + ), "CPUScaledMMLinearKernel.is_supported() should be a classmethod" + # Verify it can be called as a classmethod + result, reason = CPUScaledMMLinearKernel.is_supported() + assert isinstance(result, bool), "is_supported() should return a bool" + assert reason is None or isinstance(reason, str), "reason should be str or None" + + +def test_aiter_kernel_implements_is_supported(): + """Test that AiterScaledMMLinearKernel implements is_supported() method.""" + assert hasattr(AiterScaledMMLinearKernel, "is_supported"), ( + "AiterScaledMMLinearKernel missing is_supported() method" + ) + # Verify it's a classmethod by checking if it can be called with the class + # and by checking the method type + assert inspect.ismethod( + AiterScaledMMLinearKernel.is_supported + ) or inspect.isfunction(AiterScaledMMLinearKernel.is_supported), ( + "AiterScaledMMLinearKernel.is_supported() should be a classmethod" + ) + # Verify it can be called as a classmethod + # (will return False on CPU, which is expected) + result, reason = AiterScaledMMLinearKernel.is_supported() + assert isinstance(result, bool), "is_supported() should return a bool" + assert reason is None or isinstance(reason, str), "reason should be str or None" + # On CPU, it should return False with a reason about requiring ROCm + # This validates the method works correctly even on non-ROCm platforms + + +def test_cpu_kernel_accepts_all_configs(): + """Test that CPUScaledMMLinearKernel accepts all config combinations.""" + configs = [ + ScaledMMLinearLayerConfig( + is_channelwise=False, + is_static_input_scheme=True, + input_symmetric=True, + ), + ScaledMMLinearLayerConfig( + is_channelwise=True, + is_static_input_scheme=False, + input_symmetric=False, + ), + ] + + for config in configs: + can_impl, reason = CPUScaledMMLinearKernel.can_implement(config) + assert can_impl, ( + f"CPUScaledMMLinearKernel should accept config {config}: {reason}" + ) diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py index 2a885ec899458..7be220f7a3734 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py @@ -17,7 +17,9 @@ class ScaledMMLinearLayerConfig: class ScaledMMLinearKernel(ABC): @classmethod @abstractmethod - def get_min_capability(cls) -> int: + def is_supported( + cls, compute_capability: int | None = None + ) -> tuple[bool, str | None]: raise NotImplementedError @classmethod @@ -35,6 +37,7 @@ class ScaledMMLinearKernel(ABC): azp_adj_param_name: str, ) -> None: assert self.can_implement(c) + assert self.is_supported() self.config = c self.w_q_name = w_q_param_name self.w_s_name = w_s_param_name diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index dd59e5d935dcb..bd1d399715305 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -27,7 +27,7 @@ from vllm.platforms import PlatformEnum, current_platform # in priority/performance order (when available) _POSSIBLE_KERNELS: dict[PlatformEnum, list[type[ScaledMMLinearKernel]]] = { PlatformEnum.CPU: [CPUScaledMMLinearKernel], - PlatformEnum.CUDA: [CutlassScaledMMLinearKernel], + PlatformEnum.CUDA: [CutlassScaledMMLinearKernel, TritonScaledMMLinearKernel], PlatformEnum.ROCM: [AiterScaledMMLinearKernel, TritonScaledMMLinearKernel], PlatformEnum.TPU: [XLAScaledMMLinearKernel], } @@ -55,41 +55,25 @@ def choose_scaled_mm_linear_kernel( type[ScaledMMLinearKernel]: Chosen kernel. """ - if compute_capability is None: - _cc = current_platform.get_device_capability() - if _cc is not None: - compute_capability = _cc[0] * 10 + _cc[1] - failure_reasons = [] for kernel in _POSSIBLE_KERNELS[current_platform._enum]: if kernel.__name__ in os.environ.get("VLLM_DISABLED_KERNELS", "").split(","): - failure_reasons.append( - f" {kernel.__name__} disabled by environment variable" - ) + failure_reasons.append(f"{kernel.__name__}: disabled by env var") continue # If the current platform uses compute_capability, # make sure the kernel supports the compute cability. - if compute_capability is not None: - kernel_min_capability = kernel.get_min_capability() - if ( - kernel_min_capability is not None - and kernel_min_capability > compute_capability - ): - failure_reasons.append( - f"{kernel.__name__} requires capability " - f"{kernel_min_capability}, current compute capability " - f"is {compute_capability}" - ) - continue + is_supported, reason = kernel.is_supported(compute_capability) + if not is_supported: + failure_reasons.append(f"{kernel.__name__}: {reason}") + continue - can_implement, failure_reason = kernel.can_implement(config) - if can_implement: - return kernel - else: - failure_reasons.append( - f" {kernel.__name__} cannot implement due to: {failure_reason}" - ) + can_implement, reason = kernel.can_implement(config) + if not can_implement: + failure_reasons.append(f"{kernel.__name__}: {reason}") + continue + + return kernel raise ValueError( "Failed to find a kernel that can implement the " diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py index 038a92c516cec..971bd2005a23b 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py @@ -14,17 +14,21 @@ from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel): @classmethod - def get_min_capability(cls) -> int: - return 90 - - @classmethod - def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]: + def is_supported( + cls, compute_capability: int | None = None + ) -> tuple[bool, str | None]: if not current_platform.is_rocm(): return ( False, "AiterScaledMMLinearKernel requires `aiter` which is not " + "currently supported on non-ROCm platform.", ) + if compute_capability is None: + _cc = current_platform.get_device_capability() + if _cc is not None: + compute_capability = _cc.major * 10 + _cc.minor + if compute_capability is not None and compute_capability < 90: + return False, f"requires capability 90, got {compute_capability}" try: import aiter # noqa: F401 # deliberately attempt to import aiter @@ -34,8 +38,8 @@ class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel): "AiterScaledMMLinearKernel requires `aiter` which is not " + "installed on ROCm.", ) - # Check if rocm_aiter_gemm_w8a8_scaled_mm is enabled - if not (rocm_aiter_ops.is_linear_enabled()): + + if not rocm_aiter_ops.is_linear_enabled(): return ( False, "AiterScaledMMLinearKernel is disabled. " @@ -44,6 +48,10 @@ class AiterScaledMMLinearKernel(CutlassScaledMMLinearKernel): + "`VLLM_ROCM_USE_AITER_LINEAR` default is True.", ) + return True, None + + @classmethod + def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]: if not c.input_symmetric: return ( False, diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py index feb1e0bee1aaf..6401b94d6278b 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py @@ -19,14 +19,15 @@ from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfi class CPUScaledMMLinearKernel(ScaledMMLinearKernel): @classmethod - def get_min_capability(cls) -> int: - return 75 + def is_supported( + cls, compute_capability: int | None = None + ) -> tuple[bool, str | None]: + if not current_platform.is_cpu(): + return False, "Requires CPU." + return True, None @classmethod def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]: - if not current_platform.is_cpu(): - return False, "CPUScaledMM requires running on CPU." - return True, None def process_weights_after_loading(self, layer: torch.nn.Module) -> None: diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py index e8769916b4cef..2f00e0df8ed47 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py @@ -16,14 +16,21 @@ from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfi class CutlassScaledMMLinearKernel(ScaledMMLinearKernel): @classmethod - def get_min_capability(cls) -> int: - return 75 + def is_supported( + cls, compute_capability: int | None = None + ) -> tuple[bool, str | None]: + if not current_platform.is_cuda(): + return False, "Requires CUDA." + if compute_capability is None: + _cc = current_platform.get_device_capability() + if _cc is not None: + compute_capability = _cc.major * 10 + _cc.minor + if compute_capability is not None and compute_capability < 75: + return False, f"requires capability 75, got {compute_capability}" + return True, None @classmethod def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]: - if not current_platform.is_cuda(): - return False, "CutlassScaledMM requires running on CUDA." - return True, None def process_weights_after_loading(self, layer: torch.nn.Module) -> None: diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py index 3f4ec7f2a738b..760f1f7f79576 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py @@ -4,34 +4,53 @@ import torch +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.compressed_tensors.triton_scaled_mm import ( # noqa: E501 + triton_scaled_mm, +) +from vllm.model_executor.layers.quantization.utils import replace_parameter from vllm.platforms import current_platform -from .cutlass import CutlassScaledMMLinearKernel -from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig +from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfig -class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel): +class TritonScaledMMLinearKernel(ScaledMMLinearKernel): @classmethod - def get_min_capability(cls) -> int: - return 75 + def is_supported( + cls, compute_capability: int | None = None + ) -> tuple[bool, str | None]: + if current_platform.is_cuda_alike(): + return True, None + return False, "Requires ROCm or CUDA." @classmethod def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]: - if current_platform.is_cpu(): - return ( - False, - "TritonScaledMMLinearKernel requires Triton which is not " - + "currently supported on CPU.", - ) if not c.input_symmetric: - return ( - False, - "TritonScaledMMLinearKernel only supports symmetric " + "quantization.", - ) + return False, "Only symmetric input is supported." return True, None def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - super().process_weights_after_loading(layer) + weight = getattr(layer, self.w_q_name) + replace_parameter( + layer, + self.w_q_name, + torch.nn.Parameter(weight.t().data, requires_grad=False), + ) + + # INPUT SCALE + if self.config.is_static_input_scheme: + input_scale = getattr(layer, self.i_s_name) + replace_parameter( + layer, + self.i_s_name, + torch.nn.Parameter(input_scale.max(), requires_grad=False), + ) + setattr(layer, self.i_zp_name, None) + else: + setattr(layer, self.i_s_name, None) + setattr(layer, self.i_zp_name, None) + + setattr(layer, self.azp_adj_name, None) def apply_weights( self, @@ -39,4 +58,14 @@ class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel): x: torch.Tensor, bias: torch.Tensor | None = None, ) -> torch.Tensor: - return super().apply_weights(layer, x, bias) + w_q, w_s, i_s, i_zp, azp_adj = self._get_weight_params(layer) + + x_q, x_s, x_zp = ops.scaled_int8_quant( + x.contiguous(), i_s, i_zp, symmetric=True + ) + + assert x_zp is None, "Triton kernel only supports symmetric quantization" + + return triton_scaled_mm( + x_q, w_q, scale_a=x_s, scale_b=w_s, out_dtype=x.dtype, bias=bias + ) diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py index ddac9f13cf4f3..0be858c51993d 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py @@ -17,11 +17,12 @@ from .ScaledMMLinearKernel import ScaledMMLinearKernel, ScaledMMLinearLayerConfi class XLAScaledMMLinearKernel(ScaledMMLinearKernel): @classmethod - def get_min_capability(cls) -> int: - raise NotImplementedError( - "TPU platform does have a concept of compute capability, " - "this method should not be called." - ) + def is_supported( + cls, compute_capability: int | None = None + ) -> tuple[bool, str | None]: + if not current_platform.is_tpu(): + return False, "Requires TPU." + return True, None @classmethod def can_implement(cls, c: ScaledMMLinearLayerConfig) -> tuple[bool, str | None]: From 1f19d8f899b228a530d256bf9476d9b1ea3039af Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Fri, 12 Dec 2025 11:07:57 -0800 Subject: [PATCH 37/57] [Perf] Set split_k to 1 for triton_kernels (#30528) Signed-off-by: Xin Yang --- .../layers/quantization/utils/mxfp4_utils.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index d0c8b3d1a3093..7a351afb3c415 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -57,12 +57,18 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps): mx_axis=1, num_warps=num_warps ) ) - if current_platform.is_cuda() and current_platform.is_device_capability(100): - constraints = { - "is_persistent": True, - "epilogue_subtile": 1, - } - opt_flags.update_opt_flags_constraints(constraints) + if current_platform.is_cuda(): + if current_platform.is_device_capability(90): + constraints = { + "split_k": 1, + } + opt_flags.update_opt_flags_constraints(constraints) + elif current_platform.is_device_capability(100): + constraints = { + "is_persistent": True, + "epilogue_subtile": 1, + } + opt_flags.update_opt_flags_constraints(constraints) # transpose the tensor so that the quantization axis is on dim1 quant_tensor = quant_tensor.transpose(-2, -1) scale = scale.transpose(-2, -1) From 9693dd0fe382e10abf239e33dbb1707cebc18ff9 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Sat, 13 Dec 2025 03:21:35 +0800 Subject: [PATCH 38/57] [CI/Build] Add x86 CPU wheel release pipeline (#28848) Signed-off-by: jiang1.li --- .buildkite/release-pipeline.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 151bb6abb0905..a9d51557bd9bb 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -71,6 +71,20 @@ steps: env: DOCKER_BUILDKIT: "1" + # x86 CPU wheel build + - label: "Build x86 CPU wheel" + depends_on: ~ + id: build-wheel-x86-cpu + agents: + queue: cpu_queue_postmerge + commands: + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --build-arg VLLM_CPU_AMXBF16=true --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." + - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35" + env: + DOCKER_BUILDKIT: "1" + # Build release images (12.9) - label: "Build release image (x86)" depends_on: ~ From 6ec0d8dbe4ccff35d042fafa29f2c141e553e7ae Mon Sep 17 00:00:00 2001 From: danielafrimi <45691845+danielafrimi@users.noreply.github.com> Date: Fri, 12 Dec 2025 21:27:47 +0200 Subject: [PATCH 39/57] [Fix]Load kv-cache dtype from hf_quant_config.json automatically (#29980) Signed-off-by: Daniel Afrimi --- vllm/utils/torch_utils.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py index c97efce312b56..edcb79fbc9cd7 100644 --- a/vllm/utils/torch_utils.py +++ b/vllm/utils/torch_utils.py @@ -194,12 +194,33 @@ def get_kv_cache_torch_dtype( return torch_dtype +def get_kv_cache_quant_algo_dtype(quant_cfg: dict[str, Any]) -> torch.dtype | None: + quant_method = quant_cfg.get("quant_method", "") + if quant_method.startswith("modelopt"): + quantization_inner = quant_cfg.get("quantization", quant_cfg) + # Check if quant config is specified and use kv cache quant algo + kv_algo = quantization_inner.get("kv_cache_quant_algo") or quant_cfg.get( + "kv_cache_quant_algo" + ) + if isinstance(kv_algo, str): + return STR_DTYPE_TO_TORCH_DTYPE[kv_algo.lower()] + return None + + def kv_cache_dtype_str_to_dtype( kv_cache_dtype: str, model_config: ModelConfig ) -> torch.dtype: + # Model config may not be specified for unit tests, default to float16 + dtype = model_config.dtype if model_config else torch.half if kv_cache_dtype == "auto": - # Model config may not be specified for unit tests, default to float16 - return model_config.dtype if model_config else torch.half + hf_cfg = getattr(model_config, "hf_config", None) + if hf_cfg is not None: + quant_cfg = getattr(hf_cfg, "quantization_config", None) + if quant_cfg is not None: + kv_algo_dtype = get_kv_cache_quant_algo_dtype(quant_cfg) + return kv_algo_dtype if kv_algo_dtype is not None else dtype + return dtype + return STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype] From 13618626dff735a794cae3dec4ac4c3d78de2e86 Mon Sep 17 00:00:00 2001 From: danielafrimi <45691845+danielafrimi@users.noreply.github.com> Date: Fri, 12 Dec 2025 22:42:32 +0200 Subject: [PATCH 40/57] [MoE-FP8-modelopt] Add FlashInfer alignment padding for intermediate dimensions (#29748) Signed-off-by: Daniel Afrimi Signed-off-by: dafrimi Co-authored-by: Daniel Afrimi Co-authored-by: Tyler Michael Smith --- .../layers/quantization/modelopt.py | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 18a0fe6fbbb44..a3a8ec738dae2 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -81,6 +81,7 @@ from vllm.utils.flashinfer import ( has_flashinfer, has_flashinfer_moe, ) +from vllm.utils.math_utils import round_up if TYPE_CHECKING: from vllm.model_executor.models.utils import WeightsMapper @@ -607,6 +608,9 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): Only supports pre-quantized checkpoints with FP8 weights and scales. """ + if self.flashinfer_moe_backend is not None: + self._maybe_pad_intermediate_for_flashinfer(layer) + layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False) layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False) @@ -684,6 +688,50 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): rotate_flashinfer_fp8_moe_weights(layer.w13_weight, layer.w2_weight) register_moe_scaling_factors(layer) + def _maybe_pad_intermediate_for_flashinfer(self, layer: torch.nn.Module) -> None: + """Pad intermediate size so FlashInfer kernels' alignment constraints hold. + + Some FlashInfer FP8 MoE kernels require the (gated) intermediate size + used for GEMM to be divisible by a small alignment value. When this is + not satisfied (e.g. with certain tensor-parallel sizes), we pad the + gate/up and down projection weights along the intermediate dim. + """ + if not hasattr(layer, "w13_weight") or not hasattr(layer, "w2_weight"): + return + + # Current local intermediate size (per partition) is the K dimension of + # the down projection. + num_experts, hidden_size, intermediate = layer.w2_weight.shape + + min_alignment = 16 + padded_intermediate = round_up(intermediate, min_alignment) + + if padded_intermediate == intermediate: + return + + logger.info( + "Padding intermediate size from %d to %d for up/down projection weights.", + intermediate, + padded_intermediate, + ) + + up_mult = 2 if self.moe.is_act_and_mul else 1 + padded_gate_up_dim = up_mult * padded_intermediate + + # Pad w13 and w12 along its intermediate dimension. + w13 = layer.w13_weight.data + padded_w13 = w13.new_zeros((num_experts, padded_gate_up_dim, hidden_size)) + padded_w13[:, : w13.shape[1], :] = w13 + layer.w13_weight.data = padded_w13 + + w2 = layer.w2_weight.data + padded_w2 = w2.new_zeros((num_experts, hidden_size, padded_intermediate)) + padded_w2[:, :, :intermediate] = w2 + layer.w2_weight.data = padded_w2 + + if hasattr(layer, "intermediate_size_per_partition"): + layer.intermediate_size_per_partition = padded_intermediate + def get_fused_moe_quant_config( self, layer: torch.nn.Module ) -> FusedMoEQuantConfig | None: From 1e6b115300b8b3629d10e69db0933246fe3253af Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 12 Dec 2025 16:45:23 -0500 Subject: [PATCH 41/57] [Refactor] Reduce duplicate code in `per_token_group_quant` cuda kernels (#30496) Signed-off-by: yewentao256 --- .../w8a8/fp8/per_token_group_quant.cu | 181 ++++++++---------- 1 file changed, 83 insertions(+), 98 deletions(-) diff --git a/csrc/quantization/w8a8/fp8/per_token_group_quant.cu b/csrc/quantization/w8a8/fp8/per_token_group_quant.cu index f9ac874c43730..49d1b2086b8db 100644 --- a/csrc/quantization/w8a8/fp8/per_token_group_quant.cu +++ b/csrc/quantization/w8a8/fp8/per_token_group_quant.cu @@ -22,6 +22,62 @@ __device__ __forceinline__ float GroupReduceMax(float val) { return val; } +template +__device__ __forceinline__ float ComputeGroupScale( + const T* __restrict__ group_input, T* __restrict__ smem_group, + const int group_size, const int lane_id, const int threads_per_group, + const float eps, const float max_8bit) { + float local_absmax = eps; + + constexpr int vec_size = 16 / sizeof(T); + + // copy global -> shared & compute absmax + auto scalar_op_cache = [&] __device__(T & dst, const T& src) { + float abs_v = fabsf(static_cast(src)); + local_absmax = fmaxf(local_absmax, abs_v); + dst = src; + }; + + vllm::vectorize_with_alignment( + group_input, // in + smem_group, // out (shared) + group_size, // elements per group + lane_id, // thread id + threads_per_group, // stride in group + scalar_op_cache); // scalar handler + + local_absmax = GroupReduceMax(local_absmax); + + float y_s = local_absmax / max_8bit; + if constexpr (SCALE_UE8M0) { + y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f)))); + } + + return y_s; +} + +template +__device__ __forceinline__ void QuantizeGroup( + const T* __restrict__ smem_group, DST_DTYPE* __restrict__ group_output, + const int group_size, const int lane_id, const int threads_per_group, + const float y_s, const float min_8bit, const float max_8bit) { + constexpr int vec_size = 16 / sizeof(T); + + // quantize shared -> global 8-bit + auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) { + float q = fminf(fmaxf(static_cast(src) / y_s, min_8bit), max_8bit); + dst = DST_DTYPE(q); + }; + + vllm::vectorize_with_alignment( + smem_group, // in (shared) + group_output, // out (global quant tensor) + group_size, // elements + lane_id, // tid + threads_per_group, // stride + scalar_op_quant); // scalar handler +} + template __global__ void per_token_group_quant_8bit_kernel( @@ -38,8 +94,6 @@ __global__ void per_token_group_quant_8bit_kernel( const int64_t global_group_id = block_group_id + local_group_id; const int64_t block_group_offset = global_group_id * group_size; - float local_absmax = eps; - using scale_element_t = float; static_assert(sizeof(scale_packed_t) % sizeof(scale_element_t) == 0); @@ -68,30 +122,9 @@ __global__ void per_token_group_quant_8bit_kernel( T* smem = reinterpret_cast(smem_raw); T* smem_group = smem + local_group_id * group_size; - constexpr int vec_size = 16 / sizeof(T); - using vec_t = vllm::vec_n_t; - - // copy global -> shared & compute absmax - auto scalar_op_cache = [&] __device__(T & dst, const T& src) { - float abs_v = fabsf(static_cast(src)); - local_absmax = fmaxf(local_absmax, abs_v); - dst = src; - }; - - vllm::vectorize_with_alignment( - group_input, // in - smem_group, // out (shared) - group_size, // elements per group - lane_id, // thread id - threads_per_group, // stride in group - scalar_op_cache); // scalar handler - - local_absmax = GroupReduceMax(local_absmax); - - float y_s = local_absmax / max_8bit; - if constexpr (SCALE_UE8M0) { - y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f)))); - } + const float y_s = ComputeGroupScale( + group_input, smem_group, group_size, lane_id, threads_per_group, eps, + max_8bit); scale_element_t y_s_quant = y_s; @@ -101,19 +134,24 @@ __global__ void per_token_group_quant_8bit_kernel( __syncthreads(); - // quantize shared -> global 8-bit - auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) { - float q = fminf(fmaxf(static_cast(src) / y_s, min_8bit), max_8bit); - dst = DST_DTYPE(q); - }; + QuantizeGroup(smem_group, group_output, group_size, lane_id, + threads_per_group, y_s, min_8bit, max_8bit); +} - vllm::vectorize_with_alignment( - smem_group, // in (shared) - group_output, // out (global quant tensor) - group_size, // elements - lane_id, // tid - threads_per_group, // stride - scalar_op_quant); // scalar handler +inline int GetGroupsPerBlock(int64_t num_groups) { + if (num_groups % 16 == 0) { + return 16; + } + if (num_groups % 8 == 0) { + return 8; + } + if (num_groups % 4 == 0) { + return 4; + } + if (num_groups % 2 == 0) { + return 2; + } + return 1; } void per_token_group_quant_8bit(const torch::Tensor& input, @@ -133,17 +171,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input, constexpr int THREADS_PER_GROUP = 16; - int groups_per_block = 1; - - if (num_groups % 16 == 0) { - groups_per_block = 16; - } else if (num_groups % 8 == 0) { - groups_per_block = 8; - } else if (num_groups % 4 == 0) { - groups_per_block = 4; - } else if (num_groups % 2 == 0) { - groups_per_block = 2; - } + const int groups_per_block = GetGroupsPerBlock(num_groups); auto dst_type = output_q.scalar_type(); const int num_blocks = num_groups / groups_per_block; @@ -225,8 +253,6 @@ __global__ void per_token_group_quant_8bit_packed_kernel( const int64_t block_group_offset = global_group_id * group_size; - float local_absmax = eps; - const T* group_input = input + block_group_offset; DST_DTYPE* group_output = static_cast(output_q) + block_group_offset; @@ -235,29 +261,9 @@ __global__ void per_token_group_quant_8bit_packed_kernel( extern __shared__ __align__(16) char smem_raw[]; T* smem = reinterpret_cast(smem_raw); T* smem_group = smem + local_group_id * group_size; - - constexpr int vec_size = 16 / sizeof(T); - using vec_t = vllm::vec_n_t; - - // copy global -> shared & compute absmax - auto scalar_op_cache = [&] __device__(T & dst, const T& src) { - float abs_v = fabsf(static_cast(src)); - local_absmax = fmaxf(local_absmax, abs_v); - dst = src; - }; - - vllm::vectorize_with_alignment( - group_input, // in - smem_group, // out (shared) - group_size, // elements per group - lane_id, // thread id - threads_per_group, // stride in group - scalar_op_cache); // scalar handler - - local_absmax = GroupReduceMax(local_absmax); - - float y_s = local_absmax / max_8bit; - y_s = exp2f(ceilf(log2f(fmaxf(fabsf(y_s), 1e-10f)))); + const float y_s = + ComputeGroupScale(group_input, smem_group, group_size, lane_id, + threads_per_group, eps, max_8bit); // pack 4 scales into a uint32 if (lane_id == 0) { @@ -284,19 +290,8 @@ __global__ void per_token_group_quant_8bit_packed_kernel( __syncthreads(); - // quantize shared -> global 8-bit - auto scalar_op_quant = [&] __device__(DST_DTYPE & dst, const T& src) { - float q = fminf(fmaxf(static_cast(src) / y_s, min_8bit), max_8bit); - dst = DST_DTYPE(q); - }; - - vllm::vectorize_with_alignment( - smem_group, // in (shared) - group_output, // out (global quant tensor) - group_size, // elements - lane_id, // tid - threads_per_group, // stride - scalar_op_quant); // scalar handler + QuantizeGroup(smem_group, group_output, group_size, lane_id, + threads_per_group, y_s, min_8bit, max_8bit); } void per_token_group_quant_8bit_packed(const torch::Tensor& input, @@ -337,17 +332,7 @@ void per_token_group_quant_8bit_packed(const torch::Tensor& input, constexpr int THREADS_PER_GROUP = 16; - int groups_per_block = 1; - - if (num_groups % 16 == 0) { - groups_per_block = 16; - } else if (num_groups % 8 == 0) { - groups_per_block = 8; - } else if (num_groups % 4 == 0) { - groups_per_block = 4; - } else if (num_groups % 2 == 0) { - groups_per_block = 2; - } + const int groups_per_block = GetGroupsPerBlock(num_groups); auto dst_type = output_q.scalar_type(); const int num_blocks = num_groups / groups_per_block; From b4039c08b5dd0a14d5c1ccac8557aa11732d857c Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Fri, 12 Dec 2025 14:13:09 -0800 Subject: [PATCH 42/57] [ci] Mark PrimeRL integration test as soft fail (#30578) Signed-off-by: Kevin H. Luu --- .buildkite/test-pipeline.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0a5b56f473c29..242a110cec3b9 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1346,6 +1346,7 @@ steps: - label: Prime-RL Integration Test # 15min timeout_in_minutes: 30 optional: true + soft_fail: true num_gpus: 2 working_dir: "/vllm-workspace" source_file_dependencies: @@ -1379,4 +1380,4 @@ steps: num_gpus: 2 working_dir: "/vllm-workspace" commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 \ No newline at end of file + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 From 08f8a5627e6306312202d5c0fda8b7255d2f27fc Mon Sep 17 00:00:00 2001 From: rasmith Date: Fri, 12 Dec 2025 17:41:56 -0600 Subject: [PATCH 43/57] [CI/Build][Kernel][BugFix][AMD] Fix per_token_group_quant_fp8 to use correct fp8 min/max values and update atol/rtol in test_quantfp8_group_functionality (#30292) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- tests/kernels/quantization/test_fp8_quant_group.py | 4 ++-- vllm/model_executor/layers/quantization/utils/fp8_utils.py | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/kernels/quantization/test_fp8_quant_group.py b/tests/kernels/quantization/test_fp8_quant_group.py index 6628ac650fd5f..f5e1cde94b6e9 100644 --- a/tests/kernels/quantization/test_fp8_quant_group.py +++ b/tests/kernels/quantization/test_fp8_quant_group.py @@ -62,7 +62,7 @@ def test_quantfp8_group_functionality( assert scales_col.stride(1) == batch_size # Test column-major scales consistency - assert torch.allclose(scales_col, scales_native, rtol=1e-9, atol=1e-8) + torch.testing.assert_close(scales_col, scales_native, rtol=1e-9, atol=1e-8) # 3. Test CUDA implementation (only for divisible dimensions) if is_divisible: @@ -71,7 +71,7 @@ def test_quantfp8_group_functionality( assert scales_cuda.shape == (batch_size, expected_num_groups) # Verify CUDA/native consistency - assert torch.allclose(scales_cuda, scales_native, rtol=1e-9, atol=1e-8) + torch.testing.assert_close(scales_cuda, scales_native, rtol=2e-7, atol=2e-8) # Quantized values should mostly match diff_count = (x_quant_cuda != x_quant_native).sum().item() diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index e12fe61bf3d97..9eeb6e266c34e 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -762,9 +762,12 @@ def per_token_group_quant_fp8( ) assert x.stride(-1) == 1, "`x` groups must be contiguous" + # Using the default value (240.0) from pytorch will cause accuracy + # issue on dynamic quantization models. Here use 224.0 for fnuz on ROCm + # platforms that use the torch.float8_e4mefnuz dtype. finfo = torch.finfo(dtype) - fp8_min = finfo.min - fp8_max = finfo.max + fp8_min = -224.0 if current_platform.is_fp8_fnuz() else finfo.min + fp8_max = 224.0 if current_platform.is_fp8_fnuz() else finfo.max assert out_q is None or out_q.shape == x.shape x_q = out_q From 86a3261525858bca5d4a234691fae0496d6fed99 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Fri, 12 Dec 2025 19:02:11 -0500 Subject: [PATCH 44/57] [Bugfix] Pass FA version in `MultiHeadAttention` (#30575) Signed-off-by: Matthew Bonanni --- vllm/attention/layer.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index c77fc0fad0038..c095b94518143 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer.""" +import functools from collections.abc import Callable from typing import cast @@ -17,6 +18,7 @@ from vllm.attention.backends.abstract import ( ) from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.selector import get_attn_backend +from vllm.attention.utils.fa_utils import get_flash_attn_version from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target from vllm.attention.utils.kv_transfer_utils import maybe_transfer_kv_layer from vllm.config import CacheConfig, get_current_vllm_config @@ -524,6 +526,14 @@ class MultiHeadAttention(nn.Module): AttentionBackendEnum.ROCM_AITER_FA, } + self.fa_version = None + if self.attn_backend == AttentionBackendEnum.FLASH_ATTN: + self.fa_version = get_flash_attn_version() + assert self._flash_attn_varlen_func is not None + self._flash_attn_varlen_func = functools.partial( + self._flash_attn_varlen_func, fa_version=self.fa_version + ) + logger.info_once( f"Using {self.attn_backend} for MultiHeadAttention in multimodal encoder." ) From fc0119425c8bef73cbeadc265d70777103b3e940 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 12 Dec 2025 20:34:23 -0500 Subject: [PATCH 45/57] Add IBM and Red Hat to compute resources sponsors (#30581) Signed-off-by: Michael Goin --- README.md | 2 ++ docs/community/sponsors.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/README.md b/README.md index 5c040fe4a66d2..26222b815370d 100644 --- a/README.md +++ b/README.md @@ -143,11 +143,13 @@ Compute Resources: - Databricks - DeepInfra - Google Cloud +- IBM - Intel - Lambda Lab - Nebius - Novita AI - NVIDIA +- Red Hat - Replicate - Roblox - RunPod diff --git a/docs/community/sponsors.md b/docs/community/sponsors.md index fd1c82376d086..847b99cce45c9 100644 --- a/docs/community/sponsors.md +++ b/docs/community/sponsors.md @@ -24,11 +24,13 @@ Compute Resources: - Databricks - DeepInfra - Google Cloud +- IBM - Intel - Lambda Lab - Nebius - Novita AI - NVIDIA +- Red Hat - Replicate - Roblox - RunPod From f5dfbbd8e9f35d68b924677d574aa01857a07781 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Fri, 12 Dec 2025 21:20:15 -0500 Subject: [PATCH 46/57] [Docs] Remove references to `VLLM_ATTENTION_BACKEND` (#30564) Signed-off-by: Matthew Bonanni --- docs/getting_started/quickstart.md | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index 94920dc5306b3..e3974354d8f3b 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -281,17 +281,27 @@ Alternatively, you can use the `openai` Python package: Currently, vLLM supports multiple backends for efficient Attention computation across different platforms and accelerator architectures. It automatically selects the most performant backend compatible with your system and model specifications. -If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options: +If desired, you can also manually set the backend of your choice using the `--attention-backend` CLI argument: + +```bash +# For online serving +vllm serve Qwen/Qwen2.5-1.5B-Instruct --attention-backend FLASH_ATTN + +# For offline inference +python script.py --attention-backend FLASHINFER +``` + +Some of the available backend options include: - On NVIDIA CUDA: `FLASH_ATTN` or `FLASHINFER`. - On AMD ROCm: `TRITON_ATTN`, `ROCM_ATTN`, `ROCM_AITER_FA` or `ROCM_AITER_UNIFIED_ATTN`. -For AMD ROCm, you can further control the specific Attention implementation using the following variables: +For AMD ROCm, you can further control the specific Attention implementation using the following options: -- Triton Unified Attention: `VLLM_ROCM_USE_AITER=0 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=0` -- AITER Unified Attention: `VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=0` -- Triton Prefill-Decode Attention: `VLLM_ROCM_USE_AITER=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0` -- AITER Multi-head Attention: `VLLM_ROCM_USE_AITER=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=0 VLLM_ROCM_USE_AITER_MHA=1` +- Triton Unified Attention: Set the environment variables `VLLM_ROCM_USE_AITER=0 VLLM_ROCM_USE_AITER_MHA=0` and pass `--attention-config.use_prefill_decode_attention=false` as a CLI argument. +- AITER Unified Attention: Set the environment variables `VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0` and pass `--attention-config.use_prefill_decode_attention=false` as a CLI argument. +- Triton Prefill-Decode Attention: Set the environment variables `VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_MHA=0` and pass `--attention-config.use_prefill_decode_attention=true` as a CLI argument. +- AITER Multi-head Attention: Set the environment variables `VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_MHA=1` and pass `--attention-config.use_prefill_decode_attention=false` as a CLI argument. !!! warning There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see [docker/Dockerfile](../../docker/Dockerfile) for instructions on how to install it. From 2f32a68d75324299d13025c75f9cb5427e5c445d Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 12 Dec 2025 21:28:13 -0500 Subject: [PATCH 47/57] [CI] Update several models in registry that are available online now (#30514) Signed-off-by: mgoin Signed-off-by: Michael Goin Co-authored-by: Isotr0py <2037008807@qq.com> --- .buildkite/test-pipeline.yaml | 2 ++ tests/models/registry.py | 10 ++++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 242a110cec3b9..5fcf945f3e5a6 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -692,6 +692,7 @@ steps: source_file_dependencies: - vllm/ - tests/models/test_initialization.py + - tests/models/registry.py commands: # Run a subset of model initialization tests - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset @@ -704,6 +705,7 @@ steps: - vllm/model_executor/models/ - vllm/transformers_utils/ - tests/models/test_initialization.py + - tests/models/registry.py commands: # Only when vLLM model source is modified - test initialization of a large # subset of supported models (the complement of the small subset in the above diff --git a/tests/models/registry.py b/tests/models/registry.py index 18056a9657e82..769b33d877983 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -356,7 +356,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { ), "MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"), "MistralLarge3ForCausalLM": _HfExamplesInfo( - "mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4", is_available_online=False + "mistralai/Mistral-Large-3-675B-Instruct-2512-NVFP4" ), "MixtralForCausalLM": _HfExamplesInfo( "mistralai/Mixtral-8x7B-Instruct-v0.1", @@ -635,7 +635,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { ), "HunYuanVLForConditionalGeneration": _HfExamplesInfo( "tencent/HunyuanOCR", - is_available_online=False, + hf_overrides={"num_experts": 0}, ), "Idefics3ForConditionalGeneration": _HfExamplesInfo( "HuggingFaceM4/Idefics3-8B-Llama3", @@ -674,8 +674,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { "https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/discussions/31", ), "LightOnOCRForConditionalGeneration": _HfExamplesInfo( - "lightonai/LightOnOCR-1B", - is_available_online=False, + "lightonai/LightOnOCR-1B-1025" ), "Llama4ForConditionalGeneration": _HfExamplesInfo( "meta-llama/Llama-4-Scout-17B-16E-Instruct", @@ -779,8 +778,6 @@ _MULTIMODAL_EXAMPLE_MODELS = { "ministral-3": "mistralai/Ministral-3-3B-Instruct-2512", }, tokenizer_mode="mistral", - # TODO: revert once Mistral-Large-3 and Ministral-3 are publicly available. - is_available_online=False, ), "QwenVLForConditionalGeneration": _HfExamplesInfo( "Qwen/Qwen-VL", @@ -886,6 +883,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { "EagleMistralLarge3ForCausalLM": _HfExamplesInfo( "mistralai/Mistral-Large-3-675B-Instruct-2512", speculative_model="mistralai/Mistral-Large-3-675B-Instruct-2512-Eagle", + # TODO: revert once figuring out OOM in CI is_available_online=False, ), "LlamaForCausalLMEagle3": _HfExamplesInfo( From 57e9bf18642a391e918400a5afc7c01221635698 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Sat, 13 Dec 2025 03:49:11 +0100 Subject: [PATCH 48/57] [CI] Whisper logprobs tests (#30504) Signed-off-by: NickLucche --- tests/conftest.py | 8 +- .../multimodal/generation/test_whisper.py | 232 +++++++++--------- tests/models/registry.py | 5 +- 3 files changed, 133 insertions(+), 112 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index b21cfd5ba85c4..a03f40a9a72ac 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -702,10 +702,16 @@ class HfRunner: **kwargs, ) + # Encoder-decoder models return decoder_hidden_states instead of + # hidden_states + hidden_states = ( + getattr(output, "hidden_states", None) or output.decoder_hidden_states + ) + ( seq_logprobs_lst, output_len, - ) = self._hidden_states_to_logprobs(output.hidden_states, num_logprobs) + ) = self._hidden_states_to_logprobs(hidden_states, num_logprobs) all_logprobs.append(seq_logprobs_lst) seq_ids = output.sequences[0] diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py index 592862c2a0bb0..b206995a9cecc 100644 --- a/tests/models/multimodal/generation/test_whisper.py +++ b/tests/models/multimodal/generation/test_whisper.py @@ -1,150 +1,146 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Sequence +from typing import Any + +import librosa import pytest +from transformers import AutoModelForSpeechSeq2Seq -from vllm import SamplingParams from vllm.assets.audio import AudioAsset +from vllm.platforms import current_platform -from ....conftest import VllmRunner +from ....conftest import HfRunner, PromptAudioInput, VllmRunner from ....utils import create_new_process_for_each_test, multi_gpu_test +from ...registry import HF_EXAMPLE_MODELS +from ...utils import check_logprobs_close -PROMPTS = [ - { - "prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>", - "multi_modal_data": { - "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate, - }, - }, - { # Test explicit encoder/decoder prompt - "encoder_prompt": { - "prompt": "", - "multi_modal_data": { - "audio": AudioAsset("winning_call").audio_and_sample_rate, - }, - }, - "decoder_prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>", - }, -] +VLLM_PROMPT = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>" +HF_PROMPT = "" +# Whisper expects 16kHz audio +WHISPER_SAMPLE_RATE = 16000 -EXPECTED = { - "openai/whisper-tiny": [ - " He has birth words I spoke in the original corner of that. And a" - " little piece of black coat poetry. Mary had a little sandwich," - " sweet, with white and snow. And everyone had it very went the last" - " would sure to go.", - " >> And the old one, fit John the way to Edgar Martinez. >> One more" - " to line down the field line for our base camp. Here comes joy. Here" - " is June and the third base. They're going to wave him in. The throw" - " to the plate will be late. The Mariners are going to play for the" - " American League Championship. I don't believe it. It just continues" - " by all five.", - ], - "openai/whisper-small": [ - " The first words I spoke in the original pornograph. A little piece" - " of practical poetry. Mary had a little lamb, its fleece was quite a" - " slow, and everywhere that Mary went the lamb was sure to go.", - " And the old one pitch on the way to Edgar Martinez one month. Here" - " comes joy. Here is Junior to third base. They're gonna wave him" - " in. The throw to the plate will be late. The Mariners are going to" - " play for the American League Championship. I don't believe it. It" - " just continues. My, oh my.", - ], - "openai/whisper-medium": [ - " The first words I spoke in the original phonograph, a little piece" - " of practical poetry. Mary had a little lamb, its fleece was quite as" - " slow, and everywhere that Mary went the lamb was sure to go.", - " And the 0-1 pitch on the way to Edgar Martinez swung on the line" - " down the left field line for Obeyshev. Here comes Joy. Here is" - " Jorgen at third base. They're going to wave him in. The throw to the" - " plate will be late. The Mariners are going to play for the American" - " League Championship. I don't believe it. It just continues. My, oh" - " my.", - ], - "openai/whisper-large-v3": [ - " The first words I spoke in the original phonograph, a little piece" - " of practical poetry. Mary had a little lamb, its feet were quite as" - " slow, and everywhere that Mary went, the lamb was sure to go.", - " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line." - " Now the left field line for a base hit. Here comes Joy. Here is" - " Junior to third base. They're going to wave him in. The throw to the" - " plate will be late. The Mariners are going to play for the American" - " League Championship. I don't believe it. It just continues. My, oh," - " my.", - ], - "openai/whisper-large-v3-turbo": [ - " The first words I spoke in the original phonograph, a little piece" - " of practical poetry. Mary had a little lamb, its streets were quite" - " as slow, and everywhere that Mary went the lamb was sure to go.", - " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line" - " down the left field line for a base hit. Here comes Joy. Here is" - " Junior to third base. They're going to wave him in. The throw to the" - " plate will be late. The Mariners are going to play for the American" - " League Championship. I don't believe it. It just continues. My, oh," - " my.", - ], -} + +@pytest.fixture(autouse=True) +def use_spawn_for_whisper(monkeypatch): + """Whisper has issues with forked workers, use spawn instead.""" + monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") def run_test( + hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], + inputs: Sequence[tuple[list[str], list[str], PromptAudioInput]], model: str, *, + max_model_len: int, + dtype: str, + max_tokens: int, + num_logprobs: int, tensor_parallel_size: int, distributed_executor_backend: str | None = None, - dtype: str = "half", + enforce_eager: bool = True, ) -> None: - prompt_list = PROMPTS * 10 - expected_list = EXPECTED[model] * 10 + """Inference result should be the same between hf and vllm. + All the audio fixtures for the test are from AudioAsset. + For huggingface runner, we provide the audio as input. + For vllm runner, we provide MultiModalDataDict objects + and corresponding MultiModalConfig as input. + """ with vllm_runner( model, dtype=dtype, - max_model_len=448, + max_model_len=max_model_len, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, - # TODO (NickLucche) figure out output differences with non-eager and re-enable - enforce_eager=True, + limit_mm_per_prompt={"audio": 2}, + enforce_eager=enforce_eager, + disable_custom_all_reduce=True, ) as vllm_model: - llm = vllm_model.llm + vllm_outputs_per_case = [ + vllm_model.generate_greedy_logprobs( + vllm_prompts, + max_tokens, + num_logprobs=num_logprobs, + audios=audios, + ) + for vllm_prompts, _, audios in inputs + ] - sampling_params = SamplingParams( - temperature=0, - top_p=1.0, - max_tokens=200, + with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model: + hf_outputs_per_case = [ + hf_model.generate_greedy_logprobs_limit( + hf_prompts, + max_tokens, + num_logprobs=num_logprobs, + audios=audios, + ) + for _, hf_prompts, audios in inputs + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case): + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", ) - outputs = llm.generate(prompt_list, sampling_params) - for output, expected in zip(outputs, expected_list): - print(output.outputs[0].text) - assert output.outputs[0].text == expected +@pytest.fixture +def input_audios() -> list[tuple[list[str], list[str], list[tuple[Any, int]]]]: + audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] + inputs = [] + for asset in audio_assets: + audio, orig_sr = asset.audio_and_sample_rate + # Resample to Whisper's expected sample rate (16kHz) + if orig_sr != WHISPER_SAMPLE_RATE: + audio = librosa.resample( + audio, orig_sr=orig_sr, target_sr=WHISPER_SAMPLE_RATE + ) + # vLLM prompts, HF prompts, audio inputs + inputs.append(([VLLM_PROMPT], [HF_PROMPT], [(audio, WHISPER_SAMPLE_RATE)])) + return inputs + + +def check_model_available(model: str) -> None: + model_info = HF_EXAMPLE_MODELS.find_hf_info(model) + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") @pytest.mark.core_model -@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"]) -@pytest.mark.parametrize("dtype", ["half"]) -@create_new_process_for_each_test() -def test_models(vllm_runner, model, dtype) -> None: - run_test( - vllm_runner, - model, - tensor_parallel_size=1, - dtype=dtype, - ) - - @pytest.mark.cpu_model @pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"]) @pytest.mark.parametrize("dtype", ["half"]) -def test_models_cpu(vllm_runner, model, dtype) -> None: - # @create_new_process_for_each_test() does not work for some runners - # TODO: to fix cpu privilege issues in run-cpu-test-arm.sh +@pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("enforce_eager", [True, False]) +@create_new_process_for_each_test("spawn") +def test_models( + hf_runner, + vllm_runner, + model: str, + dtype: str, + num_logprobs: int, + input_audios, + enforce_eager: bool, +) -> None: + check_model_available(model) + if current_platform.is_cpu() and not enforce_eager: + pytest.skip("Skipping test for CPU with non-eager mode") run_test( + hf_runner, vllm_runner, + input_audios, model, - tensor_parallel_size=1, dtype=dtype, + max_model_len=448, + max_tokens=200, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + enforce_eager=enforce_eager, ) @@ -152,15 +148,31 @@ def test_models_cpu(vllm_runner, model, dtype) -> None: @pytest.mark.core_model @pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"]) @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) -@create_new_process_for_each_test() +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [200]) +@pytest.mark.parametrize("num_logprobs", [5]) +@create_new_process_for_each_test("spawn") def test_models_distributed( + hf_runner, vllm_runner, - model, - distributed_executor_backend, + model: str, + distributed_executor_backend: str, + dtype: str, + max_tokens: int, + num_logprobs: int, + input_audios, ) -> None: + check_model_available(model) run_test( + hf_runner, vllm_runner, + input_audios, model, + dtype=dtype, + max_model_len=448, + max_tokens=max_tokens, + num_logprobs=num_logprobs, tensor_parallel_size=2, distributed_executor_backend=distributed_executor_backend, + enforce_eager=False, ) diff --git a/tests/models/registry.py b/tests/models/registry.py index 769b33d877983..ca50785b46a1a 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -840,7 +840,10 @@ _MULTIMODAL_EXAMPLE_MODELS = { is_available_online=False, ), # [Encoder-decoder] - "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), + "WhisperForConditionalGeneration": _HfExamplesInfo( + "openai/whisper-large-v3-turbo", + extras={"v3": "openai/whisper-large-v3"}, + ), # [Cross-encoder] "JinaVLForRanking": _HfExamplesInfo("jinaai/jina-reranker-m0"), } From 4fa7ce46f31cbd97b4651694caf9991cc395a259 Mon Sep 17 00:00:00 2001 From: "Roberto L. Castro" <38211239+LopezCastroRoberto@users.noreply.github.com> Date: Sat, 13 Dec 2025 04:34:23 +0100 Subject: [PATCH 49/57] [Feature] Add SM103 (Blackwell Ultra) Support to vLLM (#30484) Signed-off-by: LopezCastroRoberto Signed-off-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com> Co-authored-by: youkaichao --- tests/compile/distributed/test_fusions_e2e.py | 2 +- .../kernels/attention/test_cutlass_mla_decode.py | 4 ++-- .../attention/test_flashinfer_trtllm_attention.py | 4 ++-- tests/kernels/moe/test_ocp_mx_moe.py | 4 ++-- tests/quantization/test_blackwell_moe.py | 4 ++-- vllm/model_executor/layers/batch_invariant.py | 2 +- .../layers/fused_moe/batched_deep_gemm_moe.py | 5 ++++- vllm/model_executor/layers/quantization/fp8.py | 6 +++--- vllm/model_executor/layers/quantization/mxfp4.py | 8 ++++---- .../quantization/utils/flashinfer_fp4_moe.py | 2 +- .../layers/quantization/utils/flashinfer_utils.py | 2 +- .../layers/quantization/utils/fp8_utils.py | 2 +- .../layers/quantization/utils/mxfp4_utils.py | 2 +- vllm/model_executor/models/config.py | 2 +- vllm/platforms/cuda.py | 2 +- vllm/platforms/interface.py | 15 +++++++++++++++ vllm/utils/deep_gemm.py | 4 ++-- vllm/utils/flashinfer.py | 4 +++- vllm/v1/attention/backends/flashinfer.py | 2 +- vllm/v1/attention/backends/mla/common.py | 6 +++--- vllm/v1/attention/backends/mla/flashmla_sparse.py | 4 ++-- 21 files changed, 53 insertions(+), 33 deletions(-) diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py index 5379b5157b811..1fcafe1840cd3 100644 --- a/tests/compile/distributed/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -20,7 +20,7 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer from ...utils import flat_product, multi_gpu_test -is_blackwell = lambda: current_platform.is_device_capability(100) +is_blackwell = lambda: current_platform.is_device_capability_family(100) """Are we running on Blackwell, a lot of tests depend on it""" diff --git a/tests/kernels/attention/test_cutlass_mla_decode.py b/tests/kernels/attention/test_cutlass_mla_decode.py index a60f4e385a893..784c16304a286 100644 --- a/tests/kernels/attention/test_cutlass_mla_decode.py +++ b/tests/kernels/attention/test_cutlass_mla_decode.py @@ -32,8 +32,8 @@ def cal_diff( CUTLASS_MLA_UNSUPPORTED_REASON = ( - "Cutlass MLA Requires compute capability of 10 or above." - if not current_platform.is_device_capability(100) + "Cutlass MLA Requires compute capability of 100 or above." + if not current_platform.is_device_capability_family(100) else "Cutlass MLA is supported" ) diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py index 98ea40608b468..06a7085a82ba0 100644 --- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py +++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py @@ -11,7 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import ( from vllm.platforms import current_platform from vllm.utils.math_utils import round_up -if not current_platform.is_device_capability(100): +if not current_platform.is_device_capability_family(100): pytest.skip( "This TRTLLM kernel requires NVIDIA Blackwell.", allow_module_level=True ) @@ -443,7 +443,7 @@ def test_flashinfer_trtllm_prefill_with_baseline( output_trtllm = output_trtllm.reshape(-1, query.shape[1], query.shape[2]) if q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP4_DTYPE: - rtol, atol = 1e-1, 2e-1 + rtol, atol = 3e-1, 4e-1 elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP8_DTYPE: rtol, atol = 4e-2, 6e-2 elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == dtype: diff --git a/tests/kernels/moe/test_ocp_mx_moe.py b/tests/kernels/moe/test_ocp_mx_moe.py index 5a850dda4f6fd..8fe471d124f43 100644 --- a/tests/kernels/moe/test_ocp_mx_moe.py +++ b/tests/kernels/moe/test_ocp_mx_moe.py @@ -17,7 +17,7 @@ QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse( ) >= version.parse("0.8.99") TRTLLM_GEN_MXFP4_AVAILABLE = ( - current_platform.is_cuda() and current_platform.is_device_capability(100) + current_platform.is_cuda() and current_platform.is_device_capability_family(100) ) HOPPER_MXFP4_BF16_AVAILABLE = ( @@ -799,7 +799,7 @@ def test_flashinfer_cutlass_mxfp4_fused_moe( @pytest.mark.skipif( not ( current_platform.is_cuda() - and current_platform.is_device_capability(100) + and current_platform.is_device_capability_family(100) and has_flashinfer() ), reason="NVIDIA GPU sm100 and flashinfer are required for this test", diff --git a/tests/quantization/test_blackwell_moe.py b/tests/quantization/test_blackwell_moe.py index 8dd4551ff4b96..a43d2abfdd8b8 100644 --- a/tests/quantization/test_blackwell_moe.py +++ b/tests/quantization/test_blackwell_moe.py @@ -10,9 +10,9 @@ import pytest from tests.utils import RemoteOpenAIServer from vllm.platforms import current_platform -if not current_platform.is_device_capability(100): +if not current_platform.is_device_capability_family(100): pytest.skip( - "This test only runs on Blackwell GPUs (SM100).", allow_module_level=True + "This test only runs on Blackwell GPUs (SM10x).", allow_module_level=True ) diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index b14e7dad77f9a..4f31e5afa1ac9 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -936,7 +936,7 @@ def enable_batch_invariant_mode(): # Batch invariant matmuls are no longer needed after cublas overrides if not is_torch_equal_or_newer("2.10.0.dev"): if ( - current_platform.is_device_capability(100) + current_platform.is_device_capability_family(100) or current_platform.is_device_capability(80) or current_platform.is_device_capability(89) ): diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index 53362277dae8a..15f6e3a18ed6c 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -287,7 +287,10 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): """ DeepGemm supports packed ue8m0 activation scales format in devices == sm100 """ - return is_deep_gemm_e8m0_used() and current_platform.is_device_capability(100) + return ( + is_deep_gemm_e8m0_used() + and current_platform.is_device_capability_family(100) + ) def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: # Let PrepareAndFinalize::finalize() decide the impl. diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 60dde9eb57e0f..6909bac1efc7c 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -137,7 +137,7 @@ def get_fp8_moe_backend( if ( current_platform.is_cuda() and ( - current_platform.is_device_capability(100) + current_platform.is_device_capability_family(100) or current_platform.is_device_capability(90) ) and envs.VLLM_USE_FLASHINFER_MOE_FP8 @@ -148,7 +148,7 @@ def get_fp8_moe_backend( logger.info_once("Using FlashInfer FP8 MoE TRTLLM backend for SM100") return Fp8MoeBackend.FLASHINFER_TRTLLM else: - if block_quant and current_platform.is_device_capability(100): + if block_quant and current_platform.is_device_capability_family(100): raise ValueError( "FlashInfer FP8 MoE throughput backend does not " "support block quantization. Please use " @@ -193,7 +193,7 @@ def get_fp8_moe_backend( # CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights if ( current_platform.is_cuda() - and current_platform.is_device_capability(100) + and current_platform.is_device_capability_family(100) and block_quant ): logger.info_once( diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 6eae4e9e66e1b..e96e87d15787d 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -118,19 +118,19 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend: logger.info_once("Using FlashInfer MXFP4 BF16 backend for SM90") return Mxfp4Backend.SM90_FI_MXFP4_BF16 elif ( - current_platform.is_device_capability(100) + current_platform.is_device_capability_family(100) and has_flashinfer() and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS ): logger.info_once("Using FlashInfer MXFP4 MXFP8 CUTLASS backend for SM100") return Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS elif ( - current_platform.is_device_capability(100) + current_platform.is_device_capability_family(100) and has_flashinfer() and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 ): return Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM - elif current_platform.is_device_capability(100) and has_flashinfer(): + elif current_platform.is_device_capability_family(100) and has_flashinfer(): logger.info_once( "Using FlashInfer MXFP4 BF16 backend for SM100, " "For faster performance on SM100, consider setting " @@ -139,7 +139,7 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend: ) return Mxfp4Backend.SM100_FI_MXFP4_BF16 elif ( - current_platform.is_device_capability(100) + current_platform.is_device_capability_family(100) or current_platform.is_device_capability(90) ) and not has_flashinfer(): logger.warning_once( diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index 8f96222f19f20..e424cd0e1ac99 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -50,7 +50,7 @@ def is_flashinfer_fp4_cutedsl_moe_available() -> bool: envs.VLLM_USE_FLASHINFER_MOE_FP4 and has_flashinfer_cutedsl_grouped_gemm_nt_masked() and current_platform.is_cuda() - and current_platform.is_device_capability(100) + and current_platform.is_device_capability_family(100) ) diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index ba3653e4b5ea7..09d0fe6a2f3ad 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -290,7 +290,7 @@ def get_flashinfer_moe_backend() -> FlashinferMoeBackend: if flashinfer_moe_backend in backend_map: if ( flashinfer_moe_backend == "latency" - and not current_platform.has_device_capability(100) + and not current_platform.is_device_capability_family(100) ): logger.info_once( "Flashinfer TRTLLM MOE backend is only supported on " diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 9eeb6e266c34e..ea68745585160 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -247,7 +247,7 @@ class W8A8BlockFp8LinearOp: self.act_quant_group_shape = act_quant_group_shape self.is_deep_gemm_supported = is_deep_gemm_supported() self.is_hopper = current_platform.is_device_capability(90) - self.is_blackwell = current_platform.is_device_capability(100) + self.is_blackwell = current_platform.is_device_capability_family(100) self.use_deep_gemm_e8m0 = is_deep_gemm_e8m0_used() # Get the correct blockscale mul and input quant operations. diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index 7a351afb3c415..e9ecf0547033d 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -63,7 +63,7 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps): "split_k": 1, } opt_flags.update_opt_flags_constraints(constraints) - elif current_platform.is_device_capability(100): + elif current_platform.is_device_capability_family(100): constraints = { "is_persistent": True, "epilogue_subtile": 1, diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 06cc92ee88180..4b08472538db4 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -363,7 +363,7 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig): else: kernel_block_alignment_size = 16 if ( - current_platform.is_device_capability(100) + current_platform.is_device_capability_family(100) and model_config.get_head_size() == 256 and ( attention_config.backend is None diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 915392a4125f9..ef33e64bbfdf4 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -182,7 +182,7 @@ class CudaPlatformBase(Platform): if vllm_config.attention_config.backend is None: # Default case - if cls.is_device_capability(100) and not use_sparse: + if cls.is_device_capability_family(100) and not use_sparse: # Blackwell => Force CutlassMLA (unless sparse, i.e. DSv3.2). use_cutlass_mla = True # Set the backend in AttentionConfig so it's used during diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index f04e94e425257..49437c7d56d12 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -301,6 +301,21 @@ class Platform: return current_capability.to_int() == capability + @classmethod + def is_device_capability_family( + cls, + capability: int, + device_id: int = 0, + ) -> bool: + """ + Returns True if the device capability is any .x. + Mirrors CUDA 13 'family' architecture semantics (e.g. 10.x, 11.x, 12.x). + """ + current_capability = cls.get_device_capability(device_id=device_id) + if current_capability is None: + return False + return (current_capability.to_int() // 10) == (capability // 10) + @classmethod def get_device_name(cls, device_id: int = 0) -> str: """Get the name of a device.""" diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index a099fde1bdc45..46be3e2cd5c54 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -38,7 +38,7 @@ class DeepGemmQuantScaleFMT(Enum): return DeepGemmQuantScaleFMT.FLOAT32 return ( DeepGemmQuantScaleFMT.UE8M0 - if current_platform.is_device_capability(100) + if current_platform.is_device_capability_family(100) else DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0 ) @@ -50,7 +50,7 @@ def is_deep_gemm_supported() -> bool: """ is_supported_arch = current_platform.is_cuda() and ( current_platform.is_device_capability(90) - or current_platform.is_device_capability(100) + or current_platform.is_device_capability_family(100) ) return envs.VLLM_USE_DEEP_GEMM and has_deep_gemm() and is_supported_arch diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 9a66049350cd8..5019b771f4a14 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -264,7 +264,9 @@ def supports_trtllm_attention() -> bool: return False # Requires SM100 and NVIDIA artifactory to be accessible to download cubins - return current_platform.is_device_capability(100) and has_nvidia_artifactory() + return ( + current_platform.is_device_capability_family(100) and has_nvidia_artifactory() + ) def force_use_trtllm_attention() -> bool | None: diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 4174b80ee312e..2740a6916fd97 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -564,7 +564,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): ) self.paged_kv_last_page_len_np = self.paged_kv_last_page_len_cpu.numpy() - if self.head_dim == 256 and current_platform.is_device_capability(100): + if self.head_dim == 256 and current_platform.is_device_capability_family(100): # https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that # head size 256 and block size 16 is not supported on blackwell. assert kv_cache_spec.block_size != 16, ( diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 8265503c28c35..fea482493635f 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -446,7 +446,7 @@ def use_flashinfer_prefill() -> bool: and flashinfer_available and not vllm_config.attention_config.use_cudnn_prefill and not vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill - and current_platform.is_device_capability(100) + and current_platform.is_device_capability_family(100) ) @@ -457,7 +457,7 @@ def use_cudnn_prefill() -> bool: return ( flashinfer_available and vllm_config.attention_config.use_cudnn_prefill - and current_platform.is_device_capability(100) + and current_platform.is_device_capability_family(100) and has_nvidia_artifactory() ) @@ -470,7 +470,7 @@ def use_trtllm_ragged_deepseek_prefill() -> bool: return ( flashinfer_available and vllm_config.attention_config.use_trtllm_ragged_deepseek_prefill - and current_platform.is_device_capability(100) + and current_platform.is_device_capability_family(100) ) diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py index f3052fbaf2a65..0818078da0364 100644 --- a/vllm/v1/attention/backends/mla/flashmla_sparse.py +++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py @@ -420,7 +420,7 @@ class FlashMLASparseMetadataBuilder(AttentionMetadataBuilder[FlashMLASparseMetad max_num_sm_parts = int( max((sm_count // 2) / h_k // (cdiv(h_q // h_k, 2 * 64) * s_q), 1) ) - if current_platform.is_device_capability(100): + if current_platform.is_device_capability_family(100): max_num_sm_parts *= 2 self.tile_scheduler_metadata_buffer = torch.empty( # TileSchedulerMetaDataSize = 8 @@ -719,7 +719,7 @@ class FlashMLASparseImpl(MLACommonBaseImpl[FlashMLASparseMetadata]): self.softmax_scale = scale assert indexer is not None self.topk_indices_buffer = indexer.topk_indices_buffer - self.padding = 128 if current_platform.is_device_capability(100) else 64 + self.padding = 128 if current_platform.is_device_capability_family(100) else 64 if kv_cache_dtype == "fp8_ds_mla": # Reserve workspace during initialization From fdc135d768267b3a0ae8ed6fc3eca6a68d75f7a6 Mon Sep 17 00:00:00 2001 From: Tsukasa OI Date: Sat, 13 Dec 2025 14:55:14 +0900 Subject: [PATCH 50/57] [Misc][Quantization] Clarify the intent of GGUF `FusedMoE` weight materialization (#30310) Signed-off-by: Tsukasa OI --- vllm/model_executor/layers/fused_moe/layer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 7f803720d4770..eba6ab4cc35f7 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1200,10 +1200,14 @@ class FusedMoE(CustomOp): if full_load: shard_dim += 1 - # Materialize GGUF UninitializedParameter + # Materialize GGUF UninitializedParameter accounting merged weights if is_gguf_weight and isinstance(param, UninitializedParameter): + # To materialize a tensor, we must have full shape including + # number of experts, making this portion to require `full_load`. + assert full_load final_shape = list(loaded_weight.shape) - if shard_id in ["w1", "w3"]: + # w1 and w3 are merged per expert. + if shard_id in {"w1", "w3"}: final_shape[1] *= 2 final_shape[shard_dim] = final_shape[shard_dim] // self.tp_size param.materialize(final_shape, dtype=loaded_weight.dtype) From b09806e28ffcc3e63176d668b2b3e965b35c986c Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 13 Dec 2025 15:48:56 +0800 Subject: [PATCH 51/57] [Bugfix] Dictionary MM embeddings for online chat (#30507) Signed-off-by: DarkLight1337 --- tests/entrypoints/test_chat_utils.py | 110 +++++++++++++++++++++++++-- vllm/entrypoints/chat_utils.py | 97 ++++++++++++++++------- vllm/v1/engine/input_processor.py | 30 +++++--- 3 files changed, 193 insertions(+), 44 deletions(-) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 527322c71ae4b..40059c9041541 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -796,9 +796,13 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid( "content": "<|image_1|>\nWhat's in this image?", } ] + assert mm_data is not None assert "image" in mm_data - assert mm_data["image"] is None + assert isinstance(mm_data["image"], list) + assert len(mm_data["image"]) == 1 + assert mm_data["image"][0] is None + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid]) @@ -825,10 +829,11 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid( # Should have audio in mm_data as None (UUID provided) assert mm_data is not None assert "audio" in mm_data - assert mm_data["audio"] is None + assert isinstance(mm_data["audio"], list) + assert len(mm_data["audio"]) == 1 + assert mm_data["audio"][0] is None + # UUID should be recorded - assert mm_uuids is not None - assert "audio" in mm_uuids _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[uuid]) @@ -1121,10 +1126,105 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async( mm_data = await mm_future assert mm_data is not None assert "image" in mm_data - assert mm_data["image"] is None + assert isinstance(mm_data["image"], list) + assert len(mm_data["image"]) == 1 + assert mm_data["image"][0] is None + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid]) +def test_parse_chat_messages_empty_dict_image_embeds( + phi3v_model_config_image_embeds, +): + """Test that empty dictionary for image_embeds is handled without errors.""" + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": "user", + "content": [ + {"type": "image_embeds", "image_embeds": {}}, + {"type": "text", "text": "What's in this image?"}, + ], + } + ], + phi3v_model_config_image_embeds, + content_format="string", + ) + + # Verify conversation structure + assert conversation == [ + { + "role": "user", + "content": "<|image_1|>\nWhat's in this image?", + } + ] + + # Verify mm_data contains an empty dictionary of embeddings + assert mm_data is not None + assert "image" in mm_data + assert isinstance(mm_data["image"], dict) + assert len(mm_data["image"]) == 0 + + # Verify UUIDs (None since we didn't provide any) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) + + +def test_parse_chat_messages_multiple_dict_image_embeds( + phi3v_model_config_image_embeds, +): + """Test that multiple dictionaries for image_embeds is handled without errors.""" + # Create two sample image embedding tensors + batch_size = 2 + image_embedding_1 = torch.randn(batch_size, 256, 1024) + image_embedding_2 = torch.randn(batch_size, 3) + + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": "user", + "content": [ + { + "type": "image_embeds", + "image_embeds": { + "image_embedding_1": tensor2base64(p), + "image_embedding_2": tensor2base64(i), + }, + } + for p, i in zip(image_embedding_1, image_embedding_2) + ] + + [ + {"type": "text", "text": "Describe these two images."}, + ], + } + ], + phi3v_model_config_image_embeds, + content_format="string", + ) + + # Verify conversation structure + assert conversation == [ + { + "role": "user", + "content": "<|image_1|>\n<|image_2|>\nDescribe these two images.", + } + ] + + # Verify mm_data contains a dictionary of multi-embeddings + assert mm_data is not None + assert "image" in mm_data + assert isinstance(mm_data["image"], dict) + assert len(mm_data["image"]) == batch_size + + # Verify each embedding has the correct shape + assert isinstance(mm_data["image"]["image_embedding_1"], torch.Tensor) + assert mm_data["image"]["image_embedding_1"].shape == image_embedding_1.shape + assert isinstance(mm_data["image"]["image_embedding_2"], torch.Tensor) + assert mm_data["image"]["image_embedding_2"].shape == image_embedding_2.shape + + # Verify UUIDs (None since we didn't provide any) + _assert_mm_uuids(mm_uuids, batch_size, expected_uuids=[None, None]) + + @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_async( phi3v_model_config, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index aceaa8bd45b81..5a15dec6f84c1 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -9,7 +9,7 @@ from collections import Counter, defaultdict, deque from collections.abc import Awaitable, Callable, Iterable from functools import cached_property, lru_cache, partial from pathlib import Path -from typing import Any, Generic, Literal, TypeAlias, TypeVar, cast +from typing import TYPE_CHECKING, Any, Generic, Literal, TypeAlias, TypeVar, cast import jinja2 import jinja2.ext @@ -53,7 +53,14 @@ from vllm.tokenizers import MistralTokenizer, TokenizerLike from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path from vllm.transformers_utils.processor import cached_get_processor from vllm.utils import random_uuid +from vllm.utils.collection_utils import is_list_of from vllm.utils.func_utils import supports_kw +from vllm.utils.import_utils import LazyLoader + +if TYPE_CHECKING: + import torch +else: + torch = LazyLoader("torch", globals(), "torch") logger = init_logger(__name__) @@ -620,6 +627,44 @@ ModalityStr = Literal["image", "audio", "video", "image_embeds", "audio_embeds"] _T = TypeVar("_T") +def _extract_embeds(tensors: list[torch.Tensor]): + if len(tensors) == 0: + return tensors + + if len(tensors) == 1: + tensors[0]._is_single_item = True # type: ignore + return tensors[0] # To keep backwards compatibility for single item input + + first_shape = tensors[0].shape + if all(t.shape == first_shape for t in tensors): + return torch.stack(tensors) + + return tensors + + +def _get_embeds_data(items_by_modality: dict[str, list[Any]], modality: str): + embeds_key = f"{modality}_embeds" + embeds = items_by_modality[embeds_key] + + if len(embeds) == 0: + return embeds + if is_list_of(embeds, torch.Tensor): + return _extract_embeds(embeds) + if is_list_of(embeds, dict): + if not embeds: + return {} + + first_keys = set(embeds[0].keys()) + if any(set(item.keys()) != first_keys for item in embeds[1:]): + raise ValueError( + "All dictionaries in the list of embeddings must have the same keys." + ) + + return {k: _extract_embeds([item[k] for item in embeds]) for k in first_keys} + + return embeds + + class BaseMultiModalItemTracker(ABC, Generic[_T]): """ Tracks multi-modal items in a given request and ensures that the number @@ -688,11 +733,14 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): def all_mm_uuids(self) -> MultiModalUUIDDict | None: if not self._items_by_modality: return None - mm_uuids = {} + uuids_by_modality = dict(self._uuids_by_modality) if "image" in uuids_by_modality and "image_embeds" in uuids_by_modality: raise ValueError("Mixing raw image and embedding inputs is not allowed") + if "audio" in uuids_by_modality and "audio_embeds" in uuids_by_modality: + raise ValueError("Mixing raw audio and embedding inputs is not allowed") + mm_uuids = {} if "image_embeds" in uuids_by_modality: mm_uuids["image"] = uuids_by_modality["image_embeds"] if "image" in uuids_by_modality: @@ -703,6 +751,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): mm_uuids["audio"] = uuids_by_modality["audio"] # UUIDs of audios if "video" in uuids_by_modality: mm_uuids["video"] = uuids_by_modality["video"] # UUIDs of videos + return mm_uuids @abstractmethod @@ -714,29 +763,25 @@ class MultiModalItemTracker(BaseMultiModalItemTracker[object]): def all_mm_data(self) -> MultiModalDataDict | None: if not self._items_by_modality: return None - mm_inputs = {} + items_by_modality = dict(self._items_by_modality) if "image" in items_by_modality and "image_embeds" in items_by_modality: raise ValueError("Mixing raw image and embedding inputs is not allowed") if "audio" in items_by_modality and "audio_embeds" in items_by_modality: raise ValueError("Mixing raw audio and embedding inputs is not allowed") + mm_inputs = {} if "image_embeds" in items_by_modality: - image_embeds_lst = items_by_modality["image_embeds"] - mm_inputs["image"] = ( - image_embeds_lst if len(image_embeds_lst) != 1 else image_embeds_lst[0] - ) + mm_inputs["image"] = _get_embeds_data(items_by_modality, "image") if "image" in items_by_modality: mm_inputs["image"] = items_by_modality["image"] # A list of images if "audio_embeds" in items_by_modality: - audio_embeds_lst = items_by_modality["audio_embeds"] - mm_inputs["audio"] = ( - audio_embeds_lst if len(audio_embeds_lst) != 1 else audio_embeds_lst[0] - ) + mm_inputs["audio"] = _get_embeds_data(items_by_modality, "audio") if "audio" in items_by_modality: mm_inputs["audio"] = items_by_modality["audio"] # A list of audios if "video" in items_by_modality: mm_inputs["video"] = items_by_modality["video"] # A list of videos + return mm_inputs def create_parser(self) -> "BaseMultiModalContentParser": @@ -747,38 +792,32 @@ class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]): async def all_mm_data(self) -> MultiModalDataDict | None: if not self._items_by_modality: return None - mm_inputs = {} - items_by_modality = {} - for modality, items in self._items_by_modality.items(): - coros = [] - for item in items: - if item is not None: - coros.append(item) - else: - coros.append(asyncio.sleep(0)) - items_by_modality[modality] = await asyncio.gather(*coros) + coros_by_modality = { + modality: [item or asyncio.sleep(0) for item in items] + for modality, items in self._items_by_modality.items() + } + items_by_modality: dict[str, list[object | None]] = { + modality: await asyncio.gather(*coros) + for modality, coros in coros_by_modality.items() + } if "image" in items_by_modality and "image_embeds" in items_by_modality: raise ValueError("Mixing raw image and embedding inputs is not allowed") if "audio" in items_by_modality and "audio_embeds" in items_by_modality: raise ValueError("Mixing raw audio and embedding inputs is not allowed") + mm_inputs = {} if "image_embeds" in items_by_modality: - image_embeds_lst = items_by_modality["image_embeds"] - mm_inputs["image"] = ( - image_embeds_lst if len(image_embeds_lst) != 1 else image_embeds_lst[0] - ) + mm_inputs["image"] = _get_embeds_data(items_by_modality, "image") if "image" in items_by_modality: mm_inputs["image"] = items_by_modality["image"] # A list of images if "audio_embeds" in items_by_modality: - audio_embeds_lst = items_by_modality["audio_embeds"] - mm_inputs["audio"] = ( - audio_embeds_lst if len(audio_embeds_lst) != 1 else audio_embeds_lst[0] - ) + mm_inputs["audio"] = _get_embeds_data(items_by_modality, "audio") if "audio" in items_by_modality: mm_inputs["audio"] = items_by_modality["audio"] # A list of audios if "video" in items_by_modality: mm_inputs["video"] = items_by_modality["video"] # A list of videos + return mm_inputs def create_parser(self) -> "BaseMultiModalContentParser": diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index a3c18464d3f52..5bd18cc064cb5 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -188,29 +188,39 @@ class InputProcessor: def _validate_single_prompt(single_prompt: dict | str) -> None: if not isinstance(single_prompt, dict): return + mm_data = single_prompt.get("multi_modal_data") mm_uuids = single_prompt.get("multi_modal_uuids") if not mm_data or not mm_uuids: return + import torch + + def _get_len(items: object): + if isinstance(items, dict): # Embedding inputs + return _get_len(next(iter(items.values()))) if items else 1 + + if isinstance(items, list): + return len(items) + if isinstance(items, torch.Tensor): + # To keep backwards compatibility for single item embedding input + return 1 if getattr(items, "_is_single_item", False) else len(items) + + return 1 + for modality, items in mm_data.items(): if modality in mm_uuids: - data_len = len(items) if isinstance(items, list) else 1 - uuid_len = ( - len(mm_uuids[modality]) - if isinstance(mm_uuids[modality], list) - else 1 - ) + data_len = _get_len(items) + uuid_len = _get_len(mm_uuids[modality]) if uuid_len != data_len: raise ValueError( - f"multi_modal_uuids for modality '{modality}' " + f"multi_modal_uuids for modality {modality!r} " "must have same length as data: got " - f"{uuid_len} uuids vs " - f"{data_len} items." + f"{uuid_len} uuids vs {data_len} items." ) else: raise ValueError( - f"multi_modal_uuids for modality '{modality}' must " + f"multi_modal_uuids for modality {modality!r} must " "be provided if multi_modal_data is provided." ) From 1cec5b7ea9ba72b34de9a7c7001beb8a1b8f0dc0 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sat, 13 Dec 2025 01:45:26 -0800 Subject: [PATCH 52/57] [Scheduer] Simplify stop checking for pooling models (#30591) Signed-off-by: Nick Hill --- vllm/v1/core/sched/scheduler.py | 11 +++++------ vllm/v1/core/sched/utils.py | 12 ++---------- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index a9ce6e63cc775..278970ae7ee88 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1117,6 +1117,7 @@ class Scheduler(SchedulerInterface): stopped = False new_logprobs = None new_token_ids = generated_token_ids + pooler_output = pooler_outputs[req_index] if pooler_outputs else None kv_transfer_params = None status_before_stop = request.status @@ -1125,12 +1126,10 @@ class Scheduler(SchedulerInterface): new_token_ids, stopped = self._update_request_with_output( request, new_token_ids ) - - # Stop checking for pooler models. - pooler_output = None - if pooler_outputs: - pooler_output = pooler_outputs[req_index] - stopped = check_stop(request, self.max_model_len, pooler_output) + elif request.pooling_params and pooler_output is not None: + # Pooling stops as soon as there is output. + request.status = RequestStatus.FINISHED_STOPPED + stopped = True if stopped: kv_transfer_params = self._free_request(request) diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py index 82166dc978396..6319731883225 100644 --- a/vllm/v1/core/sched/utils.py +++ b/vllm/v1/core/sched/utils.py @@ -2,8 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib -import torch - from vllm.v1.request import Request, RequestStatus @@ -39,14 +37,8 @@ def remove_all(lst: list, items_to_remove: set) -> list: return [item for item in lst if item not in items_to_remove] -def check_stop( - request: Request, max_model_len: int, pooler_output: torch.Tensor | None = None -) -> bool: - if request.pooling_params: - if pooler_output is not None: - request.status = RequestStatus.FINISHED_STOPPED - return True - return False +def check_stop(request: Request, max_model_len: int) -> bool: + assert not request.pooling_params sampling_params = request.sampling_params assert sampling_params is not None From 64251f48df0ed16fb67f12ece26ab6c7ea730e74 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 13 Dec 2025 20:42:39 +0800 Subject: [PATCH 53/57] [Chore] Adjust tokenizer import to avoid circular imports (#30601) Signed-off-by: DarkLight1337 --- benchmarks/backend_request_func.py | 2 +- tests/entrypoints/openai/test_serving_engine.py | 2 +- tests/entrypoints/test_chat_utils.py | 3 ++- tests/models/language/generation/test_mistral.py | 2 +- tests/models/multimodal/generation/test_voxtral.py | 2 +- tests/models/multimodal/processing/test_common.py | 7 ++----- tests/reasoning/test_mistral_reasoning_parser.py | 2 +- tests/reasoning/utils.py | 2 +- tests/tokenizers_/test_detokenize.py | 2 +- tests/tool_use/test_mistral_tool_parser.py | 7 ++----- vllm/entrypoints/chat_utils.py | 3 ++- vllm/entrypoints/llm.py | 3 ++- vllm/entrypoints/openai/serving_engine.py | 4 +++- vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py | 3 ++- vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py | 3 ++- .../entrypoints/openai/tool_parsers/mistral_tool_parser.py | 5 ++++- vllm/entrypoints/pooling/score/serving.py | 3 ++- vllm/entrypoints/utils.py | 2 +- vllm/model_executor/models/pixtral.py | 3 ++- vllm/model_executor/models/voxtral.py | 3 ++- vllm/reasoning/mistral_reasoning_parser.py | 2 +- vllm/v1/engine/input_processor.py | 3 ++- vllm/v1/structured_output/backend_xgrammar.py | 3 ++- 23 files changed, 40 insertions(+), 31 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index d69d74ca61f54..831b76b66e096 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -620,7 +620,7 @@ def get_tokenizer( kwargs["use_fast"] = False if tokenizer_mode == "mistral": try: - from vllm.tokenizers import MistralTokenizer + from vllm.tokenizers.mistral import MistralTokenizer except ImportError as e: raise ImportError( "MistralTokenizer requires vllm package.\n" diff --git a/tests/entrypoints/openai/test_serving_engine.py b/tests/entrypoints/openai/test_serving_engine.py index 956a06dc5487c..192c7cafb7493 100644 --- a/tests/entrypoints/openai/test_serving_engine.py +++ b/tests/entrypoints/openai/test_serving_engine.py @@ -10,7 +10,7 @@ import pytest from vllm.config import ModelConfig from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels -from vllm.tokenizers import MistralTokenizer +from vllm.tokenizers.mistral import MistralTokenizer @pytest.fixture() diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 40059c9041541..a87a4c35d3dc7 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -29,7 +29,8 @@ from vllm.multimodal.utils import ( encode_image_base64, encode_video_base64, ) -from vllm.tokenizers import MistralTokenizer, get_tokenizer +from vllm.tokenizers import get_tokenizer +from vllm.tokenizers.mistral import MistralTokenizer from vllm.utils.serial_utils import tensor2base64 from ..models.registry import HF_EXAMPLE_MODELS diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py index e2d6271e2faed..bc8bb05c284e6 100644 --- a/tests/models/language/generation/test_mistral.py +++ b/tests/models/language/generation/test_mistral.py @@ -10,7 +10,7 @@ from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( MistralToolParser, ) from vllm.sampling_params import SamplingParams -from vllm.tokenizers import MistralTokenizer +from vllm.tokenizers.mistral import MistralTokenizer from ...utils import check_logprobs_close diff --git a/tests/models/multimodal/generation/test_voxtral.py b/tests/models/multimodal/generation/test_voxtral.py index 9e9087cb0fc4d..0eaef49e2395c 100644 --- a/tests/models/multimodal/generation/test_voxtral.py +++ b/tests/models/multimodal/generation/test_voxtral.py @@ -9,7 +9,7 @@ from mistral_common.audio import Audio from mistral_common.protocol.instruct.chunk import AudioChunk, RawAudio, TextChunk from mistral_common.protocol.instruct.messages import UserMessage -from vllm.tokenizers import MistralTokenizer +from vllm.tokenizers.mistral import MistralTokenizer from ....conftest import AudioTestAssets from ....utils import RemoteOpenAIServer diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 2e032ac4ca526..67861ebfc44e4 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -22,11 +22,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict from vllm.multimodal.cache import MultiModalProcessorOnlyCache from vllm.multimodal.inputs import MultiModalInputs, batched_tensors_equal from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext -from vllm.tokenizers import ( - MistralTokenizer, - TokenizerLike, - cached_tokenizer_from_config, -) +from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config +from vllm.tokenizers.mistral import MistralTokenizer from ....multimodal.utils import random_audio, random_image, random_video from ...registry import ( diff --git a/tests/reasoning/test_mistral_reasoning_parser.py b/tests/reasoning/test_mistral_reasoning_parser.py index 01592fd0782a9..d6da723f80b08 100644 --- a/tests/reasoning/test_mistral_reasoning_parser.py +++ b/tests/reasoning/test_mistral_reasoning_parser.py @@ -5,7 +5,7 @@ import pytest from tests.reasoning.utils import run_reasoning_extraction_mistral from vllm.reasoning import ReasoningParser, ReasoningParserManager -from vllm.tokenizers import MistralTokenizer +from vllm.tokenizers.mistral import MistralTokenizer parser_name = "mistral" diff --git a/tests/reasoning/utils.py b/tests/reasoning/utils.py index 695312a0cadfe..a020fb8e97161 100644 --- a/tests/reasoning/utils.py +++ b/tests/reasoning/utils.py @@ -4,7 +4,7 @@ from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage from vllm.reasoning import ReasoningParser -from vllm.tokenizers import MistralTokenizer +from vllm.tokenizers.mistral import MistralTokenizer class StreamingReasoningReconstructor: diff --git a/tests/tokenizers_/test_detokenize.py b/tests/tokenizers_/test_detokenize.py index ae1d6b0956722..d307993d04df9 100644 --- a/tests/tokenizers_/test_detokenize.py +++ b/tests/tokenizers_/test_detokenize.py @@ -8,7 +8,7 @@ import pytest from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast from vllm.sampling_params import SamplingParams -from vllm.tokenizers import MistralTokenizer +from vllm.tokenizers.mistral import MistralTokenizer from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.detokenizer import ( FastIncrementalDetokenizer, diff --git a/tests/tool_use/test_mistral_tool_parser.py b/tests/tool_use/test_mistral_tool_parser.py index 2dd0399cb8eeb..d498863317e8d 100644 --- a/tests/tool_use/test_mistral_tool_parser.py +++ b/tests/tool_use/test_mistral_tool_parser.py @@ -13,12 +13,9 @@ from partial_json_parser.core.options import Allow from vllm.entrypoints.openai.protocol import DeltaMessage, DeltaToolCall from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolParser -from vllm.tokenizers import ( - MistralTokenizer, - TokenizerLike, - get_tokenizer, -) +from vllm.tokenizers import TokenizerLike, get_tokenizer from vllm.tokenizers.detokenizer_utils import detokenize_incrementally +from vllm.tokenizers.mistral import MistralTokenizer @pytest.fixture(scope="module") diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 5a15dec6f84c1..6a8dfe3cd9e38 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -49,7 +49,8 @@ from vllm.logger import init_logger from vllm.model_executor.models import SupportsMultiModal from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict from vllm.multimodal.utils import MEDIA_CONNECTOR_REGISTRY, MediaConnector -from vllm.tokenizers import MistralTokenizer, TokenizerLike +from vllm.tokenizers import TokenizerLike +from vllm.tokenizers.mistral import MistralTokenizer from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path from vllm.transformers_utils.processor import cached_get_processor from vllm.utils import random_uuid diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 6440b702f4fa6..31319cf64aeb8 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -72,7 +72,8 @@ from vllm.platforms import current_platform from vllm.pooling_params import PoolingParams from vllm.sampling_params import BeamSearchParams, RequestOutputKind, SamplingParams from vllm.tasks import PoolingTask -from vllm.tokenizers import MistralTokenizer, TokenizerLike +from vllm.tokenizers import TokenizerLike +from vllm.tokenizers.mistral import MistralTokenizer from vllm.usage.usage_lib import UsageContext from vllm.utils.collection_utils import as_iter, is_list_of from vllm.utils.counter import Counter diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index a799432baeb40..d83a7c8d59f39 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -117,7 +117,9 @@ from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import BeamSearchParams, SamplingParams -from vllm.tokenizers import DeepseekV32Tokenizer, MistralTokenizer, TokenizerLike +from vllm.tokenizers import TokenizerLike +from vllm.tokenizers.deepseekv32 import DeepseekV32Tokenizer +from vllm.tokenizers.mistral import MistralTokenizer from vllm.tracing import ( contains_trace_headers, extract_trace_headers, diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index 19c1c83268ed4..14cf2f38b70cc 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -22,7 +22,8 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ) from vllm.logger import init_logger -from vllm.tokenizers import MistralTokenizer, TokenizerLike +from vllm.tokenizers import TokenizerLike +from vllm.tokenizers.mistral import MistralTokenizer logger = init_logger(__name__) diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py index 4655da8dd4542..92b09917c2521 100644 --- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -21,7 +21,8 @@ from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.tool_parsers import ToolParser from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff from vllm.logger import init_logger -from vllm.tokenizers import MistralTokenizer, TokenizerLike +from vllm.tokenizers import TokenizerLike +from vllm.tokenizers.mistral import MistralTokenizer logger = init_logger(__name__) diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index bc827f045606c..f60c379d26711 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -6,6 +6,7 @@ from collections.abc import Sequence from enum import Enum, auto from random import choices from string import ascii_letters, digits +from typing import Any import ijson import regex as re @@ -24,7 +25,8 @@ from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ) from vllm.logger import init_logger -from vllm.tokenizers import MistralTokenizer, TokenizerLike +from vllm.tokenizers import TokenizerLike +from vllm.tokenizers.mistral import MistralTokenizer logger = init_logger(__name__) @@ -84,6 +86,7 @@ class MistralToolParser(ToolParser): # initialize properties used for state when parsing tool calls in # streaming mode + self.prev_tool_call_arr: list[dict[str, Any]] = [] self.current_tool_id: int = -1 self.streaming_state: StreamingState = StreamingState.WAITING_FOR_TOOL_START diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py index e5a66783005a6..f574d8bcebb40 100644 --- a/vllm/entrypoints/pooling/score/serving.py +++ b/vllm/entrypoints/pooling/score/serving.py @@ -38,7 +38,8 @@ from vllm.inputs.data import TokensPrompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput -from vllm.tokenizers import MistralTokenizer, TokenizerLike +from vllm.tokenizers import TokenizerLike +from vllm.tokenizers.mistral import MistralTokenizer from vllm.utils.async_utils import make_async, merge_async_iterators logger = init_logger(__name__) diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index daeeb995bc749..f4a633c69cb0b 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -30,7 +30,7 @@ from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.serving_models import LoRAModulePath from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.tokenizers import MistralTokenizer +from vllm.tokenizers.mistral import MistralTokenizer from vllm.utils.argparse_utils import FlexibleArgumentParser logger = init_logger(__name__) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index faf2d80d24bba..555e6ea4b8cb2 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -59,7 +59,8 @@ from vllm.multimodal.processing import ( from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors -from vllm.tokenizers import MistralTokenizer, cached_tokenizer_from_config +from vllm.tokenizers import cached_tokenizer_from_config +from vllm.tokenizers.mistral import MistralTokenizer from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 7b408248ec74c..331f0c54ecfbc 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -51,7 +51,8 @@ from vllm.multimodal.processing import ( ) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from vllm.tokenizers import MistralTokenizer, cached_tokenizer_from_config +from vllm.tokenizers import cached_tokenizer_from_config +from vllm.tokenizers.mistral import MistralTokenizer from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsTranscription from .utils import init_vllm_registered_model, maybe_prefix diff --git a/vllm/reasoning/mistral_reasoning_parser.py b/vllm/reasoning/mistral_reasoning_parser.py index 3206dbb29fe2e..de3d1296ec734 100644 --- a/vllm/reasoning/mistral_reasoning_parser.py +++ b/vllm/reasoning/mistral_reasoning_parser.py @@ -10,7 +10,7 @@ from vllm.entrypoints.openai.protocol import ( from vllm.logger import init_logger from vllm.reasoning import ReasoningParser from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser -from vllm.tokenizers import MistralTokenizer +from vllm.tokenizers.mistral import MistralTokenizer logger = init_logger(__name__) diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index 5bd18cc064cb5..65e0c845b0afa 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -19,7 +19,8 @@ from vllm.multimodal.processing import EncDecMultiModalProcessor from vllm.multimodal.utils import argsort_mm_positions from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams -from vllm.tokenizers import MistralTokenizer, TokenizerLike +from vllm.tokenizers import TokenizerLike +from vllm.tokenizers.mistral import MistralTokenizer from vllm.utils import length_from_prompt_token_ids_or_embeds from vllm.v1.engine import EngineCoreRequest from vllm.v1.metrics.stats import MultiModalCacheStats diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index 826ee08caa4e2..c5e7165026d1b 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -10,7 +10,8 @@ import torch import vllm.envs from vllm.logger import init_logger from vllm.sampling_params import SamplingParams -from vllm.tokenizers import DeepseekV32Tokenizer, MistralTokenizer +from vllm.tokenizers.deepseekv32 import DeepseekV32Tokenizer +from vllm.tokenizers.mistral import MistralTokenizer from vllm.utils.import_utils import LazyLoader from vllm.v1.structured_output.backend_types import ( StructuredOutputBackend, From e5db3e2774fd16394f8a96a608263ff2416385c8 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 13 Dec 2025 20:43:01 +0800 Subject: [PATCH 54/57] [CI/Build] Fix broken mm processor test Mistral-3-large (#30597) Signed-off-by: Isotr0py --- tests/models/multimodal/processing/test_tensor_schema.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 5d489549c5b46..cb875436857cf 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -8,6 +8,7 @@ from typing import Any, TypeAlias import numpy as np import pytest +import torch import torch.nn as nn from PIL import Image @@ -35,6 +36,7 @@ from vllm.tokenizers import cached_tokenizer_from_config from vllm.utils.collection_utils import is_list_of from vllm.utils.torch_utils import set_default_torch_dtype +from ....utils import create_new_process_for_each_test from ...registry import HF_EXAMPLE_MODELS from ...utils import dummy_hf_overrides from .test_common import get_model_ids_to_test, get_text_token_prompts @@ -136,6 +138,7 @@ def create_batched_mm_kwargs( ) +# TODO(Isotr0py): Don't initalize model during test @contextmanager def initialize_dummy_model( model_cls: type[nn.Module], @@ -150,16 +153,21 @@ def initialize_dummy_model( backend="nccl", ) initialize_model_parallel(tensor_model_parallel_size=1) + + current_device = torch.get_default_device() vllm_config = VllmConfig(model_config=model_config) with set_current_vllm_config(vllm_config=vllm_config): with set_default_torch_dtype(model_config.dtype): + torch.set_default_device(current_platform.device_type) model = model_cls(vllm_config=vllm_config) + torch.set_default_device(current_device) yield model del model cleanup_dist_env_and_memory() +@create_new_process_for_each_test() @pytest.mark.parametrize("model_id", get_model_ids_to_test()) def test_model_tensor_schema(model_id: str): model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) From ace34e3783208a31b185968a1e92c79ac8f633cb Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Sat, 13 Dec 2025 06:12:45 -0800 Subject: [PATCH 55/57] [Bugfix] Qwen3-next with --hf-overrides \{\"num_hidden_layers\":8\} (#30433) Signed-off-by: Chen Zhang --- vllm/model_executor/models/qwen3_next.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 6a5447ad0fed4..ccf6cc6e5894b 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -1092,6 +1092,8 @@ class Qwen3NextModel(nn.Module): name.endswith(".bias") or name.endswith("_bias") ) and name not in params_dict: continue + if name not in params_dict: + continue param = params_dict[name] weight_loader = param.weight_loader weight_loader( @@ -1108,6 +1110,11 @@ class Qwen3NextModel(nn.Module): continue if is_pp_missing_parameter(name, self): continue + if name not in params_dict: + logger.warning_once( + f"Parameter {name} not found in params_dict, skip loading" + ) + continue param = params_dict[name] weight_loader = getattr( param, "weight_loader", default_weight_loader From 39cefbdf17e2e906e0eae3e82bd601f66137deb4 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 13 Dec 2025 23:16:22 +0800 Subject: [PATCH 56/57] [Refactor] `TokenizerRegistry` only uses lazy imports (#30609) Signed-off-by: DarkLight1337 --- tests/test_inputs.py | 4 +- tests/tokenizers_/test_basic.py | 43 +++--- tests/tokenizers_/test_registry.py | 23 ++- vllm/entrypoints/chat_utils.py | 5 +- vllm/tokenizers/__init__.py | 6 - vllm/tokenizers/deepseekv32.py | 47 ++++-- vllm/tokenizers/hf.py | 19 +-- vllm/tokenizers/mistral.py | 7 +- vllm/tokenizers/protocol.py | 2 +- vllm/tokenizers/registry.py | 202 +++++++++++++------------- vllm/transformers_utils/tokenizer.py | 6 +- vllm/v1/engine/async_llm.py | 4 +- vllm/v1/engine/llm_engine.py | 4 +- vllm/v1/structured_output/__init__.py | 4 +- 14 files changed, 201 insertions(+), 175 deletions(-) diff --git a/tests/test_inputs.py b/tests/test_inputs.py index c4339827de8b6..8351af2528e4b 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -7,7 +7,7 @@ from vllm.config import ModelConfig from vllm.inputs import zip_enc_dec_prompts from vllm.inputs.parse import parse_raw_prompts from vllm.inputs.preprocess import InputPreprocessor -from vllm.tokenizers import init_tokenizer_from_config +from vllm.tokenizers import cached_tokenizer_from_config pytestmark = pytest.mark.cpu_test @@ -108,7 +108,7 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs): ) def test_preprocessor_always_mm_code_path(model_id, prompt): model_config = ModelConfig(model=model_id) - tokenizer = init_tokenizer_from_config(model_config) + tokenizer = cached_tokenizer_from_config(model_config) input_preprocessor = InputPreprocessor(model_config, tokenizer) # HF processor adds sep token diff --git a/tests/tokenizers_/test_basic.py b/tests/tokenizers_/test_basic.py index b152227a5a50f..0510261eacde7 100644 --- a/tests/tokenizers_/test_basic.py +++ b/tests/tokenizers_/test_basic.py @@ -3,38 +3,39 @@ from typing import _get_protocol_attrs # type: ignore import pytest -from transformers import PreTrainedTokenizerBase +from transformers import ( + PreTrainedTokenizer, + PreTrainedTokenizerBase, + PreTrainedTokenizerFast, +) from vllm.tokenizers import TokenizerLike, get_tokenizer +from vllm.tokenizers.mistral import MistralTokenizer def _get_missing_attrs(obj: object, target: type): return [k for k in _get_protocol_attrs(target) if not hasattr(obj, k)] +def _assert_tokenizer_like(tokenizer: object): + missing_attrs = _get_missing_attrs(tokenizer, TokenizerLike) + assert not missing_attrs, f"Missing attrs: {missing_attrs}" + + def test_tokenizer_like_protocol(): - assert not ( - missing_attrs := _get_missing_attrs( - get_tokenizer("gpt2", use_fast=False), - TokenizerLike, - ) - ), f"Missing attrs: {missing_attrs}" + tokenizer = get_tokenizer("gpt2", use_fast=False) + assert isinstance(tokenizer, PreTrainedTokenizer) + _assert_tokenizer_like(tokenizer) - assert not ( - missing_attrs := _get_missing_attrs( - get_tokenizer("gpt2", use_fast=True), - TokenizerLike, - ) - ), f"Missing attrs: {missing_attrs}" + tokenizer = get_tokenizer("gpt2", use_fast=True) + assert isinstance(tokenizer, PreTrainedTokenizerFast) + _assert_tokenizer_like(tokenizer) - assert not ( - missing_attrs := _get_missing_attrs( - get_tokenizer( - "mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral" - ), - TokenizerLike, - ) - ), f"Missing attrs: {missing_attrs}" + tokenizer = get_tokenizer( + "mistralai/Mistral-7B-Instruct-v0.3", tokenizer_mode="mistral" + ) + assert isinstance(tokenizer, MistralTokenizer) + _assert_tokenizer_like(tokenizer) @pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"]) diff --git a/tests/tokenizers_/test_registry.py b/tests/tokenizers_/test_registry.py index 7e795350d64c8..546f38b078dde 100644 --- a/tests/tokenizers_/test_registry.py +++ b/tests/tokenizers_/test_registry.py @@ -2,7 +2,14 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from pathlib import Path -from vllm.tokenizers import TokenizerLike, TokenizerRegistry, get_tokenizer +import pytest + +from vllm.tokenizers import TokenizerLike +from vllm.tokenizers.registry import ( + TokenizerRegistry, + get_tokenizer, + resolve_tokenizer_args, +) class TestTokenizer(TokenizerLike): @@ -40,10 +47,22 @@ class TestTokenizer(TokenizerLike): return True +@pytest.mark.parametrize("runner_type", ["generate", "pooling"]) +def test_resolve_tokenizer_args_idempotent(runner_type): + tokenizer_mode, tokenizer_name, args, kwargs = resolve_tokenizer_args( + "facebook/opt-125m", + runner_type=runner_type, + ) + + assert (tokenizer_mode, tokenizer_name, args, kwargs) == resolve_tokenizer_args( + tokenizer_name, *args, **kwargs + ) + + def test_customized_tokenizer(): TokenizerRegistry.register("test_tokenizer", __name__, TestTokenizer.__name__) - tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer", "abc") + tokenizer = TokenizerRegistry.load_tokenizer("test_tokenizer", "abc") assert isinstance(tokenizer, TestTokenizer) assert tokenizer.path_or_repo_id == "abc" assert tokenizer.bos_token_id == 0 diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 6a8dfe3cd9e38..8485022024a4f 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -50,7 +50,6 @@ from vllm.model_executor.models import SupportsMultiModal from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict from vllm.multimodal.utils import MEDIA_CONNECTOR_REGISTRY, MediaConnector from vllm.tokenizers import TokenizerLike -from vllm.tokenizers.mistral import MistralTokenizer from vllm.transformers_utils.chat_templates import get_chat_template_fallback_path from vllm.transformers_utils.processor import cached_get_processor from vllm.utils import random_uuid @@ -60,6 +59,8 @@ from vllm.utils.import_utils import LazyLoader if TYPE_CHECKING: import torch + + from vllm.tokenizers.mistral import MistralTokenizer else: torch = LazyLoader("torch", globals(), "torch") @@ -1832,7 +1833,7 @@ def apply_hf_chat_template( def apply_mistral_chat_template( - tokenizer: MistralTokenizer, + tokenizer: "MistralTokenizer", messages: list[ChatCompletionMessageParam], chat_template: str | None, tools: list[dict[str, Any]] | None, diff --git a/vllm/tokenizers/__init__.py b/vllm/tokenizers/__init__.py index 67a6d7c8eb3d9..31e74b1a16e20 100644 --- a/vllm/tokenizers/__init__.py +++ b/vllm/tokenizers/__init__.py @@ -1,9 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from .deepseekv32 import DeepseekV32Tokenizer -from .hf import HfTokenizer -from .mistral import MistralTokenizer from .protocol import TokenizerLike from .registry import ( TokenizerRegistry, @@ -15,12 +12,9 @@ from .registry import ( __all__ = [ "TokenizerLike", - "HfTokenizer", - "MistralTokenizer", "TokenizerRegistry", "cached_get_tokenizer", "get_tokenizer", "cached_tokenizer_from_config", "init_tokenizer_from_config", - "DeepseekV32Tokenizer", ] diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py index a7fa0f421725a..bf279a5cf67c5 100644 --- a/vllm/tokenizers/deepseekv32.py +++ b/vllm/tokenizers/deepseekv32.py @@ -2,24 +2,18 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from pathlib import Path +from typing import Any from transformers import BatchEncoding +from vllm.entrypoints.chat_utils import ChatCompletionMessageParam + from .deepseek_v32_encoding import encode_messages -from .hf import HfTokenizer, TokenizerLike -from .registry import TokenizerRegistry +from .hf import CachedHfTokenizer +from .protocol import TokenizerLike -@TokenizerRegistry.register("deepseek_v32") -class DeepseekV32Tokenizer(HfTokenizer): - def __init__(self, tokenizer: TokenizerLike): - self.tokenizer = tokenizer - self.name_or_path = ( - tokenizer.name_or_path if hasattr(tokenizer, "name_or_path") else "" - ) - self._added_vocab = self.tokenizer.get_added_vocab() - self._added_vocab_size = len(self._added_vocab) - +class DeepseekV32Tokenizer(CachedHfTokenizer): @classmethod def from_pretrained( cls, @@ -40,7 +34,21 @@ class DeepseekV32Tokenizer(HfTokenizer): ) return DeepseekV32Tokenizer(tokenizer) - def apply_chat_template(self, messages, tools=None, **kwargs): + def __init__(self, tokenizer: TokenizerLike) -> None: + super().__init__() + + self.tokenizer = tokenizer + self.name_or_path = getattr(tokenizer, "name_or_path", "") + + self._added_vocab = self.tokenizer.get_added_vocab() + self._added_vocab_size = len(self._added_vocab) + + def apply_chat_template( + self, + messages: list["ChatCompletionMessageParam"], + tools: list[dict[str, Any]] | None = None, + **kwargs, + ) -> str | list[int]: thinking = kwargs.get("thinking", False) thinking_mode = "thinking" if not thinking: @@ -49,13 +57,24 @@ class DeepseekV32Tokenizer(HfTokenizer): messages = conversation.copy() if tools is not None and len(tools) > 0: messages.insert(0, {"role": "system"}) - messages[0]["tools"] = tools + messages[0]["tools"] = tools # type: ignore[typeddict-unknown-key] # Historical reasoning content is dropped when a new user message is introduced drop_thinking = messages[-1]["role"] == "user" encode_config = dict(thinking_mode=thinking_mode, drop_thinking=drop_thinking) prompt_str = encode_messages(messages, **encode_config) # type: ignore + + if kwargs.get("tokenize", True): + tokenizer_kwargs = { + k: kwargs[k] for k in ("truncation", "max_length") if k in kwargs + } + return self.encode( + prompt_str, + add_special_tokens=False, + **tokenizer_kwargs, + ) + return prompt_str def num_special_tokens_to_add(self) -> int: diff --git a/vllm/tokenizers/hf.py b/vllm/tokenizers/hf.py index 3445073120387..a7b565dca5d8f 100644 --- a/vllm/tokenizers/hf.py +++ b/vllm/tokenizers/hf.py @@ -3,22 +3,18 @@ import contextlib import copy from pathlib import Path -from typing import TYPE_CHECKING +from typing import TypeAlias -from transformers import AutoTokenizer +from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast from vllm.transformers_utils.config import get_sentence_transformer_tokenizer_config from .protocol import TokenizerLike -from .registry import TokenizerRegistry -if TYPE_CHECKING: - from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast +HfTokenizer: TypeAlias = PreTrainedTokenizer | PreTrainedTokenizerFast -def get_cached_tokenizer( - tokenizer: "PreTrainedTokenizer | PreTrainedTokenizerFast", -) -> TokenizerLike: +def get_cached_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer: """ By default, transformers will recompute multiple tokenizer properties each time they are called, leading to a significant slowdown. @@ -65,11 +61,10 @@ def get_cached_tokenizer( CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}" cached_tokenizer.__class__ = CachedTokenizer - return cached_tokenizer # type: ignore + return cached_tokenizer -@TokenizerRegistry.register("hf") -class HfTokenizer(TokenizerLike): +class CachedHfTokenizer(TokenizerLike): @classmethod def from_pretrained( cls, @@ -79,7 +74,7 @@ class HfTokenizer(TokenizerLike): revision: str | None = None, download_dir: str | None = None, **kwargs, - ) -> "TokenizerLike": + ) -> HfTokenizer: try: tokenizer = AutoTokenizer.from_pretrained( path_or_repo_id, diff --git a/vllm/tokenizers/mistral.py b/vllm/tokenizers/mistral.py index 1f44037dd55ec..534b0da484a5d 100644 --- a/vllm/tokenizers/mistral.py +++ b/vllm/tokenizers/mistral.py @@ -3,10 +3,11 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, cast +from vllm.entrypoints.chat_utils import ChatCompletionMessageParam +from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.logger import init_logger from .protocol import TokenizerLike -from .registry import TokenizerRegistry if TYPE_CHECKING: from mistral_common.protocol.instruct.request import ( @@ -15,9 +16,6 @@ if TYPE_CHECKING: from mistral_common.tokens.tokenizers.tekken import Tekkenizer from transformers import BatchEncoding - from vllm.entrypoints.chat_utils import ChatCompletionMessageParam - from vllm.entrypoints.openai.protocol import ChatCompletionRequest - try: # Transformers v5 from transformers.tokenization_mistral_common import MistralCommonBackend @@ -201,7 +199,6 @@ def _tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int: return tokenizer.unk_id -@TokenizerRegistry.register("mistral") class MistralTokenizer(TokenizerLike): @classmethod def from_pretrained( diff --git a/vllm/tokenizers/protocol.py b/vllm/tokenizers/protocol.py index d6a3b0ba9b5f5..28754f9e10d00 100644 --- a/vllm/tokenizers/protocol.py +++ b/vllm/tokenizers/protocol.py @@ -97,7 +97,7 @@ class TokenizerLike(Protocol): messages: list["ChatCompletionMessageParam"], tools: list[dict[str, Any]] | None = None, **kwargs, - ) -> list[int]: + ) -> str | list[int]: raise NotImplementedError def convert_tokens_to_string(self, tokens: list[str]) -> str: diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py index 1d44feeee500f..1296ce62ae693 100644 --- a/vllm/tokenizers/registry.py +++ b/vllm/tokenizers/registry.py @@ -1,13 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib.util -from collections.abc import Callable +from dataclasses import dataclass, field from functools import lru_cache from pathlib import Path -from typing import TYPE_CHECKING, TypeVar, overload +from typing import TYPE_CHECKING import huggingface_hub -from typing_extensions import assert_never +from typing_extensions import TypeVar, assert_never, deprecated import vllm.envs as envs from vllm.logger import init_logger @@ -24,46 +24,25 @@ from vllm.utils.import_utils import resolve_obj_by_qualname from .protocol import TokenizerLike if TYPE_CHECKING: - from vllm.config import ModelConfig + from vllm.config.model import ModelConfig, RunnerType logger = init_logger(__name__) -_T = TypeVar("_T", bound=type[TokenizerLike]) + +_VLLM_TOKENIZERS = { + "deepseekv32": ("deepseekv32", "DeepseekV32Tokenizer"), + "hf": ("hf", "CachedHfTokenizer"), + "mistral": ("mistral", "MistralTokenizer"), +} -class TokenizerRegistry: - # Tokenizer name -> tokenizer_cls or (tokenizer module, tokenizer class) - REGISTRY: dict[str, type[TokenizerLike] | tuple[str, str]] = {} +@dataclass +class _TokenizerRegistry: + # Tokenizer mode -> (tokenizer module, tokenizer class) + tokenizers: dict[str, tuple[str, str]] = field(default_factory=dict) - # In-tree tokenizers - @staticmethod - @overload - def register(tokenizer_mode: str) -> Callable[[_T], _T]: ... - - # OOT tokenizers - @staticmethod - @overload - def register(tokenizer_mode: str, module: str, class_name: str) -> None: ... - - @staticmethod - def register( - tokenizer_mode: str, - module: str | None = None, - class_name: str | None = None, - ) -> Callable[[_T], _T] | None: - # In-tree tokenizers - if module is None or class_name is None: - - def wrapper(tokenizer_cls: _T) -> _T: - assert tokenizer_mode not in TokenizerRegistry.REGISTRY - TokenizerRegistry.REGISTRY[tokenizer_mode] = tokenizer_cls - - return tokenizer_cls - - return wrapper - - # OOT tokenizers - if tokenizer_mode in TokenizerRegistry.REGISTRY: + def register(self, tokenizer_mode: str, module: str, class_name: str) -> None: + if tokenizer_mode in self.tokenizers: logger.warning( "%s.%s is already registered for tokenizer_mode=%r. " "It is overwritten by the new one.", @@ -72,36 +51,42 @@ class TokenizerRegistry: tokenizer_mode, ) - TokenizerRegistry.REGISTRY[tokenizer_mode] = (module, class_name) + self.tokenizers[tokenizer_mode] = (module, class_name) return None - @staticmethod - def get_tokenizer(tokenizer_mode: str, *args, **kwargs) -> "TokenizerLike": - if tokenizer_mode not in TokenizerRegistry.REGISTRY: + def load_tokenizer_cls(self, tokenizer_mode: str) -> type[TokenizerLike]: + if tokenizer_mode not in self.tokenizers: raise ValueError(f"No tokenizer registered for {tokenizer_mode=!r}.") - item = TokenizerRegistry.REGISTRY[tokenizer_mode] - if isinstance(item, type): - return item.from_pretrained(*args, **kwargs) - - module, class_name = item + module, class_name = self.tokenizers[tokenizer_mode] logger.debug_once(f"Loading {class_name} for {tokenizer_mode=!r}") - class_ = resolve_obj_by_qualname(f"{module}.{class_name}") - return class_.from_pretrained(*args, **kwargs) + return resolve_obj_by_qualname(f"{module}.{class_name}") + + def load_tokenizer(self, tokenizer_mode: str, *args, **kwargs) -> TokenizerLike: + tokenizer_cls = self.load_tokenizer_cls(tokenizer_mode) + return tokenizer_cls.from_pretrained(*args, **kwargs) -def get_tokenizer( +TokenizerRegistry = _TokenizerRegistry( + { + mode: (f"vllm.tokenizers.{mod_relname}", cls_name) + for mode, (mod_relname, cls_name) in _VLLM_TOKENIZERS.items() + } +) + + +def resolve_tokenizer_args( tokenizer_name: str | Path, *args, + runner_type: "RunnerType" = "generate", tokenizer_mode: str = "auto", - trust_remote_code: bool = False, - revision: str | None = None, - download_dir: str | None = None, **kwargs, -) -> TokenizerLike: - """Gets a tokenizer for the given model name via HuggingFace or ModelScope.""" +): + revision: str | None = kwargs.get("revision") + download_dir: str | None = kwargs.get("download_dir") + if envs.VLLM_USE_MODELSCOPE: # download model from ModelScope hub, # lazy import so that modelscope is not required for normal use. @@ -125,16 +110,6 @@ def get_tokenizer( ) tokenizer_name = tokenizer_path - if tokenizer_mode == "slow": - if kwargs.get("use_fast", False): - raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.") - - tokenizer_mode = "hf" - kwargs["use_fast"] = False - - if "truncation_side" not in kwargs: - kwargs["truncation_side"] = "left" - # Separate model folder from file path for GGUF models if is_gguf(tokenizer_name): if check_gguf_file(tokenizer_name): @@ -150,6 +125,21 @@ def get_tokenizer( ) kwargs["gguf_file"] = gguf_file + if "truncation_side" not in kwargs: + if runner_type == "generate" or runner_type == "draft": + kwargs["truncation_side"] = "left" + elif runner_type == "pooling": + kwargs["truncation_side"] = "right" + else: + assert_never(runner_type) + + if tokenizer_mode == "slow": + if kwargs.get("use_fast", False): + raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.") + + tokenizer_mode = "hf" + kwargs["use_fast"] = False + # Try to use official Mistral tokenizer if possible if tokenizer_mode == "auto" and importlib.util.find_spec("mistral_common"): allow_patterns = ["tekken.json", "tokenizer.model.v*"] @@ -165,49 +155,70 @@ def get_tokenizer( if tokenizer_mode == "auto": tokenizer_mode = "hf" - tokenizer_args = (tokenizer_name, *args) - tokenizer_kwargs = dict( + return tokenizer_mode, tokenizer_name, args, kwargs + + +cached_resolve_tokenizer_args = lru_cache(resolve_tokenizer_args) + + +def tokenizer_args_from_config(config: "ModelConfig", **kwargs): + return cached_resolve_tokenizer_args( + config.tokenizer, + runner_type=config.runner_type, + tokenizer_mode=config.tokenizer_mode, + revision=config.tokenizer_revision, + trust_remote_code=config.trust_remote_code, + **kwargs, + ) + + +_T = TypeVar("_T", bound=TokenizerLike, default=TokenizerLike) + + +def get_tokenizer( + tokenizer_name: str | Path, + *args, + tokenizer_cls: type[_T] = TokenizerLike, # type: ignore[assignment] + trust_remote_code: bool = False, + revision: str | None = None, + download_dir: str | None = None, + **kwargs, +) -> _T: + """Gets a tokenizer for the given model name via HuggingFace or ModelScope.""" + tokenizer_mode, tokenizer_name, args, kwargs = cached_resolve_tokenizer_args( + tokenizer_name, + *args, trust_remote_code=trust_remote_code, revision=revision, download_dir=download_dir, **kwargs, ) - if tokenizer_mode == "custom": - logger.warning_once( - "TokenizerRegistry now uses `tokenizer_mode` as the registry key " - "instead of `tokenizer_name`. " - "Please update the definition of `.from_pretrained` in " - "your custom tokenizer to accept `args=%s`, `kwargs=%s`. " - "Then, you can pass `tokenizer_mode=%r` instead of " - "`tokenizer_mode='custom'` when initializing vLLM.", - tokenizer_args, - str(tokenizer_kwargs), - tokenizer_name, - ) + if tokenizer_cls == TokenizerLike: + tokenizer_cls_ = TokenizerRegistry.load_tokenizer_cls(tokenizer_mode) + else: + tokenizer_cls_ = tokenizer_cls - tokenizer_mode = str(tokenizer_name) - - tokenizer = TokenizerRegistry.get_tokenizer( - tokenizer_mode, - *tokenizer_args, - **tokenizer_kwargs, - ) + tokenizer = tokenizer_cls_.from_pretrained(tokenizer_name, *args, **kwargs) if not tokenizer.is_fast: logger.warning( "Using a slow tokenizer. This might cause a significant " "slowdown. Consider using a fast tokenizer instead." ) - return tokenizer + return tokenizer # type: ignore cached_get_tokenizer = lru_cache(get_tokenizer) def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs): + if model_config.skip_tokenizer_init: + return None + return cached_get_tokenizer( model_config.tokenizer, + runner_type=model_config.runner_type, tokenizer_mode=model_config.tokenizer_mode, revision=model_config.tokenizer_revision, trust_remote_code=model_config.trust_remote_code, @@ -215,19 +226,8 @@ def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs): ) +@deprecated( + "Renamed to `cached_tokenizer_from_config`. The old name will be removed in v0.14." +) def init_tokenizer_from_config(model_config: "ModelConfig"): - runner_type = model_config.runner_type - if runner_type == "generate" or runner_type == "draft": - truncation_side = "left" - elif runner_type == "pooling": - truncation_side = "right" - else: - assert_never(runner_type) - - return get_tokenizer( - model_config.tokenizer, - tokenizer_mode=model_config.tokenizer_mode, - trust_remote_code=model_config.trust_remote_code, - revision=model_config.tokenizer_revision, - truncation_side=truncation_side, - ) + return cached_tokenizer_from_config(model_config) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 8745e1d9dbbbc..90af573535d3b 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -60,17 +60,17 @@ def __getattr__(name: str): return cached_tokenizer_from_config if name == "init_tokenizer_from_configs": - from vllm.tokenizers import init_tokenizer_from_config + from vllm.tokenizers import cached_tokenizer_from_config warnings.warn( "`vllm.transformers_utils.tokenizer.init_tokenizer_from_configs` " - "has been moved to `vllm.tokenizers.init_tokenizer_from_config`. " + "has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. " "The old name will be removed in v0.14.", DeprecationWarning, stacklevel=2, ) - return init_tokenizer_from_config + return cached_tokenizer_from_config raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 8eff61563ccea..a6ee241c41151 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -26,7 +26,7 @@ from vllm.plugins.io_processors import get_io_processor from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.tasks import SupportedTask -from vllm.tokenizers import TokenizerLike, init_tokenizer_from_config +from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config from vllm.tracing import init_tracer from vllm.transformers_utils.config import maybe_register_config_serialize_by_value from vllm.usage.usage_lib import UsageContext @@ -111,7 +111,7 @@ class AsyncLLM(EngineClient): if self.model_config.skip_tokenizer_init: tokenizer = None else: - tokenizer = init_tokenizer_from_config(self.model_config) + tokenizer = cached_tokenizer_from_config(self.model_config) self.input_processor = InputProcessor(self.vllm_config, tokenizer) self.io_processor = get_io_processor( diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 4422eced82fea..1011317b706d3 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -23,7 +23,7 @@ from vllm.plugins.io_processors import get_io_processor from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.tasks import SupportedTask -from vllm.tokenizers import TokenizerLike, init_tokenizer_from_config +from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config from vllm.tracing import init_tracer from vllm.usage.usage_lib import UsageContext from vllm.v1.engine import EngineCoreRequest @@ -86,7 +86,7 @@ class LLMEngine: if self.model_config.skip_tokenizer_init: tokenizer = None else: - tokenizer = init_tokenizer_from_config(self.model_config) + tokenizer = cached_tokenizer_from_config(self.model_config) self.input_processor = InputProcessor(self.vllm_config, tokenizer) self.io_processor = get_io_processor( diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 4dd478804049b..79ee4161e9dfa 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -7,7 +7,7 @@ from typing import TYPE_CHECKING from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.reasoning import ReasoningParserManager -from vllm.tokenizers import init_tokenizer_from_config +from vllm.tokenizers import cached_tokenizer_from_config from vllm.utils.import_utils import LazyLoader from vllm.v1.structured_output.backend_guidance import GuidanceBackend from vllm.v1.structured_output.backend_types import ( @@ -71,7 +71,7 @@ class StructuredOutputManager: # of CPUs. max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2) self.executor = ThreadPoolExecutor(max_workers=max_workers) - self.tokenizer = init_tokenizer_from_config( + self.tokenizer = cached_tokenizer_from_config( model_config=self.vllm_config.model_config ) reasoning_parser = ( From 763963aa7358e19d627f1bf614a00f415a4ef6b3 Mon Sep 17 00:00:00 2001 From: Laith Sakka Date: Sat, 13 Dec 2025 18:36:53 +0300 Subject: [PATCH 57/57] set assume_32bit_indexing and pass unbacked hints (#30459) Signed-off-by: Laith Sakka --- vllm/compilation/decorators.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 31f5e78408460..f07061bdb7b2d 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -28,7 +28,7 @@ from vllm.config.compilation import DynamicShapesType from vllm.logger import init_logger from vllm.sequence import IntermediateTensors from vllm.utils.import_utils import resolve_obj_by_qualname -from vllm.utils.torch_utils import supports_dynamo +from vllm.utils.torch_utils import is_torch_equal_or_newer, supports_dynamo from .monitor import start_monitoring_torch_compile @@ -316,7 +316,13 @@ def _support_torch_compile( def _mark_dynamic_inputs(mod, type, *args, **kwargs): def mark_dynamic(arg, dims): if type == DynamicShapesType.UNBACKED: - torch._dynamo.decorators.mark_unbacked(arg, dims) + if is_torch_equal_or_newer("2.10.0.dev"): + for dim in dims: + torch._dynamo.decorators.mark_unbacked( + arg, dim, hint_override=arg.size()[dim] + ) + else: + torch._dynamo.decorators.mark_unbacked(arg, dims) else: torch._dynamo.mark_dynamic(arg, dims) @@ -350,7 +356,13 @@ def _support_torch_compile( if isinstance(arg, torch.Tensor): # In case dims is specified with negative indexing dims = [arg.ndim + dim if dim < 0 else dim for dim in dims] - torch._dynamo.decorators.mark_unbacked(arg, dims) + if is_torch_equal_or_newer("2.10.0.dev"): + for dim in dims: + torch._dynamo.decorators.mark_unbacked( + arg, dim, hint_override=arg.size()[dim] + ) + else: + torch._dynamo.decorators.mark_unbacked(arg, dims) def __call__(self, *args, **kwargs): # torch.compiler.is_compiling() means we are inside the compilation @@ -488,6 +500,12 @@ def _support_torch_compile( if ds_type == DynamicShapesType.BACKED_SIZE_OBLIVIOUS: fx_config_patches["backed_size_oblivious"] = True + # Prepare inductor config patches + # assume_32bit_indexing is only available in torch 2.10.0.dev+ + inductor_config_patches = {} + if is_torch_equal_or_newer("2.10.0.dev"): + inductor_config_patches["assume_32bit_indexing"] = True + with ( patch.object( InliningInstructionTranslator, "inline_call_", patched_inline_call @@ -496,6 +514,7 @@ def _support_torch_compile( maybe_use_cudagraph_partition_wrapper(self.vllm_config), torch.fx.experimental._config.patch(**fx_config_patches), _torch27_patch_tensor_subclasses(), + torch._inductor.config.patch(**inductor_config_patches), ): if envs.VLLM_USE_AOT_COMPILE: self.aot_compiled_fn = self.aot_compile(*args, **kwargs)