[V1][Mamba1] - Full CUDA and Piecewise CUDA Graphs Support (#23035)
Signed-off-by: asafg <asafg@ai21.com>
Signed-off-by: asafg <39553475+Josephasafg@users.noreply.github.com>
Co-authored-by: asafg <asafg@ai21.com>

parent 2461d9e562
commit 3663870c72
@@ -107,7 +107,7 @@ to enable simultaneous generation and embedding using the same engine instance i
 #### Mamba Models
 
 Models using selective state-space mechanisms instead of standard transformer attention are supported.
-Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. Please note that these models currently require disabling prefix caching in V1. Additionally, Mamba-1 models require `enforce_eager=True`.
+Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. Please note that these models currently require disabling prefix caching in V1.
 
 Models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
 `Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). Please note that
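Since the docs change above drops the `enforce_eager=True` requirement for Mamba-1, here is a minimal offline-inference sketch of what now works under V1 (model name, prompt, and sampling settings are illustrative; prefix caching still has to be disabled):

    from vllm import LLM, SamplingParams

    # Sketch only: Mamba-1 checkpoints can now run with CUDA graphs in V1,
    # so enforce_eager is no longer passed; prefix caching must stay off.
    llm = LLM(
        model="state-spaces/mamba-130m-hf",
        enable_prefix_caching=False,
    )
    outputs = llm.generate(["The capital of France is"],
                           SamplingParams(max_tokens=16))
    print(outputs[0].outputs[0].text)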
@@ -54,16 +54,14 @@ V1_SUPPORTED_MODELS = [
     "tiiuae/Falcon-H1-0.5B-Base",
 ]
 
+FULL_CUDA_GRAPH_MODELS = [
+    "ai21labs/Jamba-tiny-dev",
+    "Zyphra/Zamba2-1.2B-instruct",
+]
+
 # Avoid OOM
 MAX_NUM_SEQS = 4
 
-# Once we add support for FCG in Mamba1, this list will be removed and tests
-# all test cases will use enforce_eager=False
-ENFORCE_EAGER_MODELS_V1 = [
-    "state-spaces/mamba-130m-hf",
-    "ai21labs/Jamba-tiny-dev",
-]
-
 
 @pytest.mark.parametrize("model", SSM_MODELS + HYBRID_MODELS)
 @pytest.mark.parametrize("max_tokens", [64])

@@ -101,19 +99,13 @@ def test_models(
             example_prompts, max_tokens, num_logprobs)
 
     if model in V1_SUPPORTED_MODELS:
-        enforce_eager = False
         with monkeypatch.context() as m:
            m.setenv("VLLM_USE_V1", "1")
            if model in HYBRID_MODELS:
                # required due to reorder_batch behaviour
                m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
-
-            if model in ENFORCE_EAGER_MODELS_V1:
-                enforce_eager = True
-
            with vllm_runner(model,
                             max_num_seqs=MAX_NUM_SEQS,
-                             enforce_eager=enforce_eager,
                             enable_prefix_caching=False) as vllm_model:
                vllm_v1_outputs = vllm_model.generate_greedy_logprobs(
                    example_prompts, max_tokens, num_logprobs)

@@ -373,7 +365,7 @@ def test_distributed_correctness(
     )
 
 
-@pytest.mark.parametrize("model", ["Zyphra/Zamba2-1.2B-instruct"])
+@pytest.mark.parametrize("model", FULL_CUDA_GRAPH_MODELS)
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
 def test_full_cuda_graph(
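The body of `test_full_cuda_graph` is not part of this diff; as a hedged sketch of the configuration it exercises for the two `FULL_CUDA_GRAPH_MODELS`, assuming the engine is asked for full CUDA graphs through the `full_cuda_graph` compilation flag that the Mamba metadata builders read below:

    from vllm import LLM

    # Assumed invocation: with full CUDA graph capture requested, decode-only
    # batches get padded to the nearest captured size by the Mamba attention
    # metadata builders (see the backend changes later in this commit).
    llm = LLM(
        model="ai21labs/Jamba-tiny-dev",
        max_num_seqs=4,                               # mirrors MAX_NUM_SEQS
        enable_prefix_caching=False,
        compilation_config={"full_cuda_graph": True},
    )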
@@ -336,6 +336,7 @@ class CompilationConfig:
             "vllm.unified_attention",
             "vllm.unified_attention_with_output",
             "vllm.mamba_mixer2",
+            "vllm.mamba_mixer",
         ]
 
     def compute_hash(self) -> str:
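Registering "vllm.mamba_mixer" as a splitting op is what gives Mamba-1 piecewise CUDA graphs: torch.compile cuts the captured graph around every mixer call, so the SSM kernels run eagerly while the surrounding graph pieces are replayed from capture. A sketch that spells out the resulting default list explicitly (normally unnecessary, since this list is the default after this change):

    from vllm.config import CompilationConfig

    # Sketch: the default splitting-op list after this commit, written out.
    compilation_config = CompilationConfig(
        splitting_ops=[
            "vllm.unified_attention",
            "vllm.unified_attention_with_output",
            "vllm.mamba_mixer2",
            "vllm.mamba_mixer",
        ],
    )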
@@ -27,6 +27,8 @@ from vllm.model_executor.layers.mamba.ops.mamba_ssm import (
     selective_scan_fn, selective_state_update)
 from vllm.model_executor.models.mamba_cache import MambaCacheParams
 from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
+from vllm.utils import direct_register_custom_op
 from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionMetadata
 
 

@@ -183,22 +185,26 @@ class MambaMixer(MambaBase, CustomOp):
 
     def forward(self,
                 hidden_states: torch.Tensor,
+                output: torch.Tensor,
                 mamba_cache_params: Optional[MambaCacheParams] = None):
         if not envs.VLLM_USE_V1:
-            return CustomOp.forward(self, hidden_states, mamba_cache_params)
+            CustomOp.forward(self, hidden_states, output, mamba_cache_params)
         else:
-            return self.forward_cuda(
+            torch.ops.vllm.mamba_mixer(
                 hidden_states,
-                mamba_cache_params,
+                output,
+                self.prefix,
             )
 
     def forward_native(self,
                        hidden_states: torch.Tensor,
+                       output: torch.Tensor,
                        mamba_cache_params: Optional[MambaCacheParams] = None):
         pass
 
     def forward_cuda(self,
                      hidden_states: torch.Tensor,
+                     output: torch.Tensor,
                      mamba_cache_params: Optional[MambaCacheParams] = None):
         """
         Run the Mamba-1 SSM pipeline.

@@ -237,6 +243,7 @@ class MambaMixer(MambaBase, CustomOp):
             conv_state = self_kv_cache[0].transpose(-1, -2)
             ssm_state = self_kv_cache[1]
             has_initial_states = mamba1_metadata.has_initial_states
+            num_padded_decodes = mamba1_metadata.num_padded_decodes
         else:
             assert isinstance(attn_metadata, AttentionMetadata)
             assert mamba_cache_params is not None

@@ -248,6 +255,7 @@ class MambaMixer(MambaBase, CustomOp):
             has_initial_states = None
             if context_lens_tensor is not None:
                 has_initial_states = context_lens_tensor > 0
+            num_padded_decodes = attn_metadata.num_decode_tokens
 
         # 1. Gated MLP's linear projection
         projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1)

@@ -267,6 +275,7 @@ class MambaMixer(MambaBase, CustomOp):
         num_decodes = attn_metadata.num_decode_tokens  # token count (=request)
         has_prefill = num_prefill_tokens > 0
         has_decode = num_decode_tokens > 0
+        num_actual_tokens = num_prefill_tokens + num_decode_tokens
 
         prefill_decode_split = split_batch_to_prefill_and_decode(
             hidden_states_BC,

@@ -278,6 +287,7 @@ class MambaMixer(MambaBase, CustomOp):
             num_decode_tokens,
             num_prefills,
             num_decodes,
+            num_padded_decodes,
         )
         hidden_states_BC_p = prefill_decode_split.hidden_states_BC_p
         hidden_states_BC_d = prefill_decode_split.hidden_states_BC_d

@@ -371,7 +381,7 @@ class MambaMixer(MambaBase, CustomOp):
         else:
             out = self.out_proj(scan_outputs_combined.transpose(-2, -1))[0]
 
-        return out
+        output[:num_actual_tokens] = out
 
     def get_state_dtype(self) -> tuple[torch.dtype]:
         assert self.model_config is not None

@@ -421,18 +431,27 @@ def split_batch_to_prefill_and_decode(
     num_decode_tokens: int,
     num_prefills: int,
     num_decodes: int,
+    num_padded_decodes: int,
 ) -> PrefillDecodeSplit:
+    num_actual_tokens = num_prefill_tokens + num_padded_decodes
+
     if envs.VLLM_USE_V1:
         # In v1, decode tokens come first, then prefill tokens.
         hidden_states_BC_d, hidden_states_BC_p = torch.split(
-            hidden_states_BC, [num_decode_tokens, num_prefill_tokens], dim=-1)
-        gate_d, gate_p = torch.split(gate,
-                                     [num_decode_tokens, num_prefill_tokens],
+            hidden_states_BC[..., :num_actual_tokens],
+            [num_padded_decodes, num_prefill_tokens],
+            dim=-1)
+        gate_d, gate_p = torch.split(gate[..., :num_actual_tokens],
+                                     [num_padded_decodes, num_prefill_tokens],
                                      dim=-1)
+
+        # num_padded_decodes accounts for CUDA graph padding when applicable
         state_indices_tensor_d, state_indices_tensor_p = torch.split(
-            state_indices_tensor, [num_decodes, num_prefills], dim=0)
+            state_indices_tensor[:num_padded_decodes + num_prefills],
+            [num_padded_decodes, num_prefills],
+            dim=0)
         query_start_loc_p = (query_start_loc[-num_prefills - 1:] -
-                             num_decodes if num_prefills > 0 else None)
+                             num_padded_decodes if num_prefills > 0 else None)
         has_initial_states_p = has_initial_states[-num_prefills:] if (
             has_initial_states is not None and num_prefills > 0) else None
     else:
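To make the padded split above concrete, a standalone sketch of the slicing that `num_padded_decodes` introduces (toy shapes and counts; in the real code the tensors are the batched mixer inputs and the padding comes from the attention metadata):

    import torch

    num_prefill_tokens = 5     # toy values
    num_decodes = 3            # real decode requests in the batch
    num_padded_decodes = 4     # decode slots after CUDA graph padding
    num_actual_tokens = num_prefill_tokens + num_padded_decodes

    hidden_states_BC = torch.randn(2, 16, 12)  # [..., tokens], 12 >= 9
    # Drop trailing capture padding, then split decode-first as in V1.
    hidden_states_BC_d, hidden_states_BC_p = torch.split(
        hidden_states_BC[..., :num_actual_tokens],
        [num_padded_decodes, num_prefill_tokens],
        dim=-1)
    assert hidden_states_BC_d.shape[-1] == num_padded_decodes
    assert hidden_states_BC_p.shape[-1] == num_prefill_tokens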
@@ -459,3 +478,32 @@ def split_batch_to_prefill_and_decode(
         query_start_loc_p=query_start_loc_p,
         has_initial_states_p=has_initial_states_p,
     )
+
+
+def mamba_mixer(
+    hidden_states: torch.Tensor,
+    output: torch.Tensor,
+    layer_name: str,
+) -> None:
+    forward_context: ForwardContext = get_forward_context()
+    self = forward_context.no_compile_layers[layer_name]
+    self.forward_cuda(hidden_states=hidden_states,
+                      output=output,
+                      mamba_cache_params=None)
+
+
+def mamba_mixer_fake(
+    hidden_states: torch.Tensor,
+    output: torch.Tensor,
+    layer_name: str,
+) -> None:
+    return
+
+
+direct_register_custom_op(
+    op_name="mamba_mixer",
+    op_func=mamba_mixer,
+    mutates_args=["output"],
+    fake_impl=mamba_mixer_fake,
+    dispatch_key=current_platform.dispatch_key,
+)
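The registration above turns the mixer into a destination-passing custom op: the caller allocates the output buffer, the op mutates it in place (mutates_args=["output"]) and returns nothing, and mamba_mixer_fake gives torch.compile a shape-only stand-in. A sketch of the calling convention used by the decoder layers in the model hunks below (shapes and the layer prefix are illustrative; the op call itself only works inside a running V1 forward context):

    import torch

    hidden_states = torch.randn(9, 768)   # illustrative [tokens, hidden_size]
    output = torch.empty_like(hidden_states)
    # Inside a V1 forward pass, the op looks the layer up by its prefix and
    # writes the mixer result into `output`, e.g.:
    #   torch.ops.vllm.mamba_mixer(hidden_states, output, "model.layers.0.mixer")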
@@ -10,6 +10,7 @@ from transformers import JambaConfig
 
 from vllm import envs
 from vllm.attention.layer import Attention
+from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import get_pp_group

@@ -154,10 +155,10 @@ class JambaMambaDecoderLayer(nn.Module):
             hidden_states, residual = self.input_layernorm(
                 hidden_states, residual)
 
-        hidden_states = self.mamba(hidden_states, mamba_cache_params)
+        output = torch.empty_like(hidden_states)
+        self.mamba(hidden_states, output, mamba_cache_params)
         # Fully Connected
-        hidden_states, residual = self.pre_ff_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.pre_ff_layernorm(output, residual)
         hidden_states = self.feed_forward(hidden_states)
         return hidden_states, residual
 

@@ -278,6 +279,7 @@ ALL_DECODER_LAYER_TYPES = {
 }
 
 
+@support_torch_compile
 class JambaModel(nn.Module):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -9,6 +9,7 @@ from torch import nn
 from transformers import MambaConfig
 
 from vllm import envs
+from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, ModelConfig, VllmConfig
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.model_executor.layers.layernorm import RMSNorm

@@ -81,10 +82,12 @@ class MambaDecoderLayer(nn.Module):
         else:
             hidden_states, residual = self.norm(hidden_states, residual)
 
-        hidden_states = self.mixer(hidden_states, mamba_cache_params)
-        return hidden_states, residual
+        output = torch.empty_like(hidden_states)
+        self.mixer(hidden_states, output, mamba_cache_params)
+        return output, residual
 
 
+@support_torch_compile
 class MambaModel(nn.Module):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -2,16 +2,16 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from dataclasses import dataclass
-from typing import ClassVar, Optional
+from typing import Optional
 
 import torch
 
 from vllm.attention.backends.abstract import AttentionBackend
-from vllm.config import VllmConfig
-from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
-                                              CommonAttentionMetadata,
+from vllm.attention.backends.utils import PAD_SLOT_ID
+from vllm.v1.attention.backends.mamba_attn import (
+    BaseMambaAttentionMetadataBuilder)
+from vllm.v1.attention.backends.utils import (CommonAttentionMetadata,
                                               split_decodes_and_prefills)
-from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec
 
 
 class Mamba1AttentionBackend(AttentionBackend):

@@ -31,24 +31,11 @@ class Mamba1AttentionMetadata:
     num_prefill_tokens: int
     num_decodes: int
     num_decode_tokens: int
+    num_padded_decodes: int
 
 
 class Mamba1AttentionMetadataBuilder(
-        AttentionMetadataBuilder[Mamba1AttentionMetadata]):
-    reorder_batch_threshold: ClassVar[int] = 1
-
-    def __init__(
-        self,
-        kv_cache_spec: AttentionSpec,
-        vllm_config: VllmConfig,
-        device: torch.device,
-        layer_names: list[str],
-    ):
-        assert isinstance(kv_cache_spec, MambaSpec)
-        self.kv_cache_spec = kv_cache_spec
-        self.device = device
-        self.vllm_config = vllm_config
-        self.layer_names = layer_names
+        BaseMambaAttentionMetadataBuilder[Mamba1AttentionMetadata]):
 
     def build(
         self,

@@ -67,9 +54,18 @@ class Mamba1AttentionMetadataBuilder(
                 decode_threshold=1))
 
         has_initial_states = None
+        padded_decodes = num_decodes
 
         if num_prefills > 0:
             has_initial_states = context_lens_tensor > 0
+        elif (num_decodes > 0 and num_decodes <= self.decode_cudagraph_max_bs
+              and self.compilation_config.full_cuda_graph):
+            state_indices_for_decode = state_indices_tensor[:num_decodes]
+            padded_decodes = self.vllm_config.pad_for_cudagraph(num_decodes)
+            self.state_indices_tensor[:num_decodes].copy_(
+                state_indices_for_decode, non_blocking=True)
+            state_indices_tensor = self.state_indices_tensor[:padded_decodes]
+            state_indices_tensor[num_decodes:] = PAD_SLOT_ID
 
         return Mamba1AttentionMetadata(
             query_start_loc=query_start_loc,

@@ -80,4 +76,5 @@ class Mamba1AttentionMetadataBuilder(
             num_prefill_tokens=num_prefill_tokens,
             num_decodes=num_decodes,
             num_decode_tokens=num_decode_tokens,
+            num_padded_decodes=padded_decodes,
         )
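A self-contained sketch of the decode padding performed in build() above (toy numbers; pad_for_cudagraph is stubbed with a fixed capture size, and the PAD_SLOT_ID value is an assumption for the sentinel that the kernels skip):

    import torch

    PAD_SLOT_ID = -1    # assumed sentinel, as in vllm.attention.backends.utils
    CAPTURE_SIZE = 8    # stand-in for vllm_config.pad_for_cudagraph(num_decodes)

    num_decodes = 5
    state_indices = torch.arange(num_decodes, dtype=torch.int32)

    # Persistent buffer sized for the largest captured decode batch.
    persistent = torch.empty(CAPTURE_SIZE, dtype=torch.int32)
    persistent[:num_decodes].copy_(state_indices)
    padded = persistent[:CAPTURE_SIZE]
    padded[num_decodes:] = PAD_SLOT_ID   # padded slots point at no real state
    print(padded)  # tensor([0, 1, 2, 3, 4, -1, -1, -1], dtype=torch.int32)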
@@ -2,18 +2,18 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 from dataclasses import dataclass
-from typing import ClassVar, Optional
+from typing import Optional
 
 import torch
 
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.config import VllmConfig
-from vllm.v1.attention.backends.utils import (AttentionCGSupport,
-                                              AttentionMetadataBuilder,
-                                              CommonAttentionMetadata,
+from vllm.v1.attention.backends.mamba_attn import (
+    BaseMambaAttentionMetadataBuilder)
+from vllm.v1.attention.backends.utils import (CommonAttentionMetadata,
                                               split_decodes_and_prefills)
-from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec
+from vllm.v1.kv_cache_interface import AttentionSpec
 
 
 def _query_start_loc_to_chunk_indices_offsets(query_start_loc: torch.Tensor,

@@ -88,29 +88,14 @@ class Mamba2AttentionMetadata:
 
 
 class Mamba2AttentionMetadataBuilder(
-        AttentionMetadataBuilder[Mamba2AttentionMetadata]):
-    cudagraph_support: ClassVar[AttentionCGSupport] = \
-        AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
-
-    reorder_batch_threshold: ClassVar[int] = 1
+        BaseMambaAttentionMetadataBuilder[Mamba2AttentionMetadata]):
 
     def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
                  vllm_config: VllmConfig, device: torch.device):
-        assert isinstance(kv_cache_spec, MambaSpec)
-        self.kv_cache_spec = kv_cache_spec
+        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
         self.chunk_size = vllm_config.model_config.get_mamba_chunk_size()
-        self.vllm_config = vllm_config
-        self.compilation_config = vllm_config.compilation_config
         assert self.chunk_size is not None, (
             "chunk_size needs to be set in the model config for Mamba2 models")
-        self.decode_cudagraph_max_bs = min(
-            self.vllm_config.scheduler_config.max_num_seqs,
-            self.compilation_config.max_capture_size)
-        self.state_indices_tensor = torch.empty(
-            (self.decode_cudagraph_max_bs, ),
-            dtype=torch.int32,
-            device=device,
-        )
 
     def build(self,
               common_prefix_len: int,

@@ -187,19 +172,3 @@ class Mamba2AttentionMetadataBuilder(
             state_indices_tensor=state_indices_tensor,
         )
         return attn_metadata
-
-    def build_for_cudagraph_capture(
-            self, common_attn_metadata: CommonAttentionMetadata):
-        """
-        This method builds the metadata for full cudagraph capture.
-        Currently, only decode is supported for full cudagraphs with Mamba.
-        """
-        m = common_attn_metadata
-
-        assert m.num_reqs == m.num_actual_tokens, \
-            "Mamba only supports decode-only full CUDAGraph capture. " \
-            "Make sure all cudagraph capture sizes <= max_num_seq."
-
-        m.max_query_len = 1  # decode-only
-
-        return self.build(0, m)
vllm/v1/attention/backends/mamba_attn.py (new file, 55 lines)
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import abc
+from typing import ClassVar, TypeVar
+
+import torch
+
+from vllm.config import VllmConfig
+from vllm.v1.attention.backends.utils import (AttentionCGSupport,
+                                              AttentionMetadataBuilder,
+                                              CommonAttentionMetadata)
+from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec
+
+M = TypeVar("M")
+
+
+class BaseMambaAttentionMetadataBuilder(AttentionMetadataBuilder[M], abc.ABC):
+    reorder_batch_threshold: ClassVar[int] = 1
+    cudagraph_support: ClassVar[AttentionCGSupport] = \
+        AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
+
+    def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
+                 vllm_config: VllmConfig, device: torch.device):
+        assert isinstance(kv_cache_spec, MambaSpec)
+        self.kv_cache_spec = kv_cache_spec
+        self.device = device
+        self.vllm_config = vllm_config
+        self.layer_names = layer_names
+
+        self.compilation_config = vllm_config.compilation_config
+        self.decode_cudagraph_max_bs = min(
+            self.vllm_config.scheduler_config.max_num_seqs,
+            self.compilation_config.max_capture_size)
+        self.state_indices_tensor = torch.empty(
+            (self.decode_cudagraph_max_bs, ),
+            dtype=torch.int32,
+            device=device,
+        )
+
+    def build_for_cudagraph_capture(
+            self, common_attn_metadata: CommonAttentionMetadata) -> M:
+        """
+        This method builds the metadata for full cudagraph capture.
+        Currently, only decode is supported for full cudagraphs with Mamba.
+        """
+        m = common_attn_metadata
+
+        assert m.num_reqs == m.num_actual_tokens, \
+            "Mamba only supports decode-only full CUDAGraph capture. " \
+            "Make sure all cudagraph capture sizes <= max_num_seq."
+
+        m.max_query_len = 1  # decode-only
+
+        return self.build(0, m)