From f0d738f0cc460b14981aab5350b86130c6e7c5ac Mon Sep 17 00:00:00 2001
From: shen-shanshan <467638484@qq.com>
Date: Tue, 23 Dec 2025 09:07:23 +0000
Subject: [PATCH 1/6] add custom op doc

Signed-off-by: shen-shanshan <467638484@qq.com>
---
 docs/design/custom_op.md         | 236 +++++++++++++++++++++++++++++++
 vllm/config/compilation.py       |   3 +-
 vllm/model_executor/custom_op.py |   9 +-
 3 files changed, 244 insertions(+), 4 deletions(-)
 create mode 100644 docs/design/custom_op.md

diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md
new file mode 100644
index 0000000000000..fee7f89171b23
--- /dev/null
+++ b/docs/design/custom_op.md
@@ -0,0 +1,236 @@
+# CustomOp
+
+`CustomOp` is an abstract class used for dispatching the forward method of various operations to the appropriate backend. It also offers a mechanism for both vLLM and OOT (Out-Of-Tree) plugins to register their custom operations.
+
+This document will introduce how CustomOp works in vLLM and how to implement a new `CustomOp`.
+
+## How CustomOp Works in vLLM
+
+`CustomOp` manages two dictionaries of all custom ops (i.e., op classes, indexed by registered name) in its class, for vLLM and OOT plugins respectively.
+
+??? code
+
+    ```python
+    class CustomOp(nn.Module):
+
+        op_registry: dict[str, type["CustomOp"]] = {}
+        op_registry_oot: dict[str, type["CustomOp"]] = {}
+    ```
+
+We can use `@CustomOp.register("op_name")` to register an op class to the `CustomOp` system. After this, the `op_name` and its class will be added into the `op_registry` dictionary. In addition, we can also register an OOT op by `@CustomOp.register_oot("op_name")`. We will introduce this mechanism in detail later.
+
+When a `CustomOp` is called (i.e., call its `forward()` method), if it is enabled, it will automatically dispatch the forward method to the appropriate backend according to `current_platform`. Otherwise (i.e., it is disabled), it will only call the `forward_native()` method to use PyTorch-native implementation of this forward method.
+
+- **CPU platform:** dispatch to `forward_cpu()`.
+- **CUDA platform:** dispatch to `forward_cuda()`.
+- **ROCm platform:** dispatch to `forward_hip()`. If `forward_hip()` is not implemented, it will use `forward_cuda()` as a fallback.
+- **XPU platform:** dispatch to `forward_xpu()`.
+- **TPU platform:** dispatch to `forward_tpu()`.
+- **OOT platform:** dispatch to `forward_oot()`. This will only be called on OOT platforms.
+- **Default:** dispatch to `forward_native()` as a final fallback for all platforms.
+
+Furthur more, vLLM decides whether enable or disable a `CustomOp` by `compilation_config.custom_ops`. To be specific, if a `CustomOp` is not registered (i.e., use default config), it will be enabled if there is a `all` in `compilation_config.custom_ops` or will be disabled if there is a `none`. 
+
+!!! note
+    Note that `all` and `none` cannot coexist in `compilation_config.custom_ops`.
+
+By default, if `compilation_config.backend == "inductor"` and `compilation_config.mode != CompilationMode.NONE`, a `none` will be appended into `compilation_config.custom_ops`, otherwise a `all` will be appended. In other words, this means `CustomOp` will be disabled in some platforms (i.e., those use `inductor` as dafault backend for `torch.compile`) when running with graph mode. In this case, Inductor generates (fused) Triton kernels for those disabled custom ops.
+
+!!! note
+    For multi-modal models, vLLM has enforece enabled some custom ops to use device-specific deep-optimized kernels for better performance in ViT part, such as `MMEncoderAttention` and `ApplyRotaryEmb`. We can also pass a `enforce_enable=True` param to the `__init__()` method of the `CustomOp` to enforce enable itself at object-level.
+    
+    Note that this `enforce_enable` mechanism will be removed after we adding a separate `compilation_config` for multi-modal part.
+
+## How to Customise Your Configuration for CustomOp
+
+vLLM also offers fine-grained control over which custom ops to enable or disable for users, by manually passing a `--compilation_config.custom_ops '["..."]'` when launching a server.
+
+For example:
+
+- Use `--compilation_config.custom_ops '["all"]'` to enable all custom ops.
+- Use `--compilation_config.custom_ops '["none"]'` to disable all custom ops.
+- Use `--compilation_config.custom_ops '["all", "-op1"]'` to enable all custom ops except op1 (i.e., prefixed with a `-` means "disable").
+- Use `--compilation_config.custom_ops '["none", "+op1", "+op2"]'` to only enable op1 and op2 (i.e., prefixed with a `+` means "enable").
+
+## Types of Supported CustomOp in vLLM
+
+| Category | OP Name | OP Class |
+|----------|---------|----------|
+| Attention | `mm_encoder_attn` | `MMEncoderAttention` |
+| Attention | `multi_head_latent_attention` | `MultiHeadLatentAttentionWrapper` |
+| Activation | `fatrelu_and_mul` | `FatreluAndMul` |
+| Activation | `silu_and_mul` | `SiluAndMul` |
+| Activation | `mul_and_silu` | `MulAndSilu` |
+| Activation | `gelu_and_mul_sparse` | `GeluAndMulSparse` |
+| Activation | `gelu_and_mul` | `GeluAndMul` |
+| Activation | `swigluoai_and_mul` | `SwigluOAIAndMul` |
+| Activation | `gelu_new` | `NewGELU` |
+| Activation | `gelu_fast` | `FastGELU` |
+| Activation | `quick_gelu` | `QuickGELU` |
+| Activation | `relu2` | `ReLUSquaredActivation` |
+| Activation | `xielu` | `XIELU` |
+| Conv | `conv2d` | `Conv2dLayer` |
+| Conv | `conv3d` | `Conv3dLayer` |
+| Conv | `short_conv` | `ShortConv` |
+| Embedding | `vocab_parallel_embedding` | `VocabParallelEmbedding` |
+| Embedding | `parallel_lm_head` | `ParallelLMHead` |
+| Linear | `row_parallel_linear` | `RowParallelLinear` |
+| Linear | `column_parallel_linear` | `ColumnParallelLinear` |
+| Linear | `replicated_linear` | `ReplicatedLinear` |
+| Logits Processor | `logits_processor` | `LogitsProcessor` |
+| Mamba | `mamba_mixer` | `MambaMixer` |
+| Mamba | `mamba_mixer2` | `MambaMixer2` |
+| Mamba | `plamo2_mamba_mixer` | `Plamo2MambaMixer` |
+| Mamba | `mixer2_gated_rms_norm` | `Mixer2RMSNormGated` |
+| MoE | `fused_moe` | `FusedMoE` |
+| MoE | `modular_fused_moe` | `FusedMoEModularMethod` |
+| MoE | `unquantized_fused_moe` | `UnquantizedFusedMoEMethod` |
+| MoE | `transformers_fused_moe` | `TransformersFusedMoE` |
+| MoE | `grouped_topk` | `GroupedTopk` |
+| Norm | `rms_norm` | `RMSNorm` |
+| Norm | `gemma_rms_norm` | `GemmaRMSNorm` |
+| Norm | `rms_norm_gated` | `RMSNormGated` |
+| Quantization | `quant_fp8` | `QuantFP8` |
+| Rope | `rotary_embedding` | `RotaryEmbeddingBase` |
+| Rope | `dual_chunk_rotary_embedding` | `DualChunkRotaryEmbedding` |
+| Rope | `apply_rotary_emb` | `ApplyRotaryEmb` |
+
+## Guidelines for Implementing a New CustomOp
+
+### Implement a New CustomOp in vLLM
+
+This part is a tutorial on how to implement a new `CustomOp` in vLLM.
+
+Steps:
+
+1. Implement a new op class, which extends from `CustomOp` base class.
+2. Add the `@CustomOp.register("op_name")` decorator on this op class to register it into `CustomOp` system.
+3. Implement different `forward_xxx()` methods according to your needs.
+
+Taking `MMEncoderAttention` as an example:
+
+??? code
+
+    ```python
+    @CustomOp.register("mm_encoder_attn")
+    class MMEncoderAttention(CustomOp):
+
+        def __init__(
+            self,
+            num_heads: int,
+            head_size: int,
+            scale: float | None = None,
+            num_kv_heads: int | None = None,
+            prefix: str = "",
+            multimodal_config: MultiModalConfig | None = None,
+        ) -> None:
+            super().__init__()
+            # Init...
+
+        def forward_native(
+            self,
+            query: torch.Tensor,
+            key: torch.Tensor,
+            value: torch.Tensor,
+            cu_seqlens: torch.Tensor | None = None,
+            max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
+        ) -> torch.Tensor:
+            # Call TORCH_SDPA implementation...
+
+        def forward_cuda(
+            self,
+            query: torch.Tensor,
+            key: torch.Tensor,
+            value: torch.Tensor,
+            cu_seqlens: torch.Tensor | None = None,
+            max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
+        ) -> torch.Tensor:
+            # Call FA or TORCH_SDPA implementation...
+
+        def forward_cpu(
+            self,
+            query: torch.Tensor,
+            key: torch.Tensor,
+            value: torch.Tensor,
+            cu_seqlens: torch.Tensor | None = None,
+            max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
+        ) -> torch.Tensor:
+            # Call TORCH_SDPA implementation...
+
+        def forward_xpu(
+            self,
+            query: torch.Tensor,
+            key: torch.Tensor,
+            value: torch.Tensor,
+            cu_seqlens: torch.Tensor | None = None,
+            max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
+        ) -> torch.Tensor:
+            # Call FA implementation...
+
+        def forward_tpu(
+            self,
+            query: torch.Tensor,
+            key: torch.Tensor,
+            value: torch.Tensor,
+            cu_seqlens: torch.Tensor | None = None,
+            max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
+        ) -> torch.Tensor:
+            # Call PALLAS implementation...
+    ```
+
+### Register a New CustomOp in OOT Device Plugins
+
+Currently, thanks to [vLLM's hardware-plugin mechanism](./plugin_system.md), various OOT device plugins are emerging that enable vLLM to run seamlessly on different hardware. You can also find more details about this mechanism at [Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html).
+
+- **Official device plugins:** [vllm-ascend](https://github.com/vllm-project/vllm-ascend) (for Huawei Ascend NPU), [vllm-spyre](https://github.com/vllm-project/vllm-spyre)
+(for Spyre), [vllm-gaudi](https://github.com/vllm-project/vllm-gaudi) (for Intel Gaudi), [vllm-neuron](https://github.com/vllm-project/vllm-neuron) (for AWS Neuron), [vllm-metal](https://github.com/vllm-project/vllm-metal) (for Apple Silicon), etc.
+- **Non-official device plugins:** [vllm-metax](https://github.com/MetaX-MACA/vLLM-metax) (for MetaX GPU), [vllm-kunlun](https://github.com/baidu/vLLM-Kunlun) (for Baidu Kunlun XPU), etc.
+
+In this case, `CustomOp` can enable these hardware manufacturers to seamlessly replace vLLM's operations with their deep-optimized kernels for specific devices at runtime, by just registering an OOT `CustomOp` and implementing the `forward_oot()` method.
+
+Now, this part will show you how to register an OOT `CustomOp` for a device plugin.
+
+Taking `MMEncoderAttention` as an example:
+
+1. Implement a `CustomMMEncoderAttention` class which extends from `MMEncoderAttention` and implement its `forward_oot()` method.
+2. Register your `CustomMMEncoderAttention` into vLLM to replace `MMEncoderAttention`.
+
+??? code
+
+    ```python
+    from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
+    from vllm.model_executor.custom_op import CustomOp
+
+
+    @CustomOp.register_oot("MMEncoderAttention")
+    class CustomMMEncoderAttention(MMEncoderAttention):
+
+        def __init__(...):
+            super().__init__(...)
+        
+        def forward_oot(...):
+            # Call optimized device-specific kernels.
+            ...
+    ```
+
+In this case, a new item `{"MMEncoderAttention": CustomMMEncoderAttention}` will be added into `op_registry_oot`. When initializing a `MMEncoderAttention` op object, if the class name (i.e., `MMEncoderAttention`) is contained in the keys of `op_registry_oot`, vLLM will replace it with our registered class (i.e., `CustomMMEncoderAttention`) and instantiate it.
+
+After that, when this `MMEncoderAttention` op is called, your `forward_oot()` will be called if it is enabled. Thus, you will get the expected performance on your hardware without directly modifying vLLM.
+
+In addition, you can also register all your `CustomOp` at one place for better management.
+
+??? code
+
+    ```python
+    from vllm.model_executor.custom_op import CustomOp
+
+
+    REGISTERED_CUSTOM_OPS = {
+        "CustomOP1": YourCustomOp1,
+        "CustomOP2": YourCustomOp2,
+        "CustomOP3": YourCustomOp3,
+    }
+
+    for op_name, op_cls in REGISTERED_CUSTOM_OPS.items():
+        CustomOp.register_oot(_decorated_op_cls=op_cls, name=op_name)
+    ```
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index cd527e4198557..56e69541e6b81 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -404,7 +404,8 @@ class CompilationConfig:
     - 'none,+op1,+op2' to enable only op1 and op2
 
     By default, all custom ops are enabled when running without Inductor and
-    disabled when running with Inductor: mode>=VLLM_COMPILE and backend="inductor".
+    disabled when running with Inductor: mode>=CompilationMode.NONE and
+    backend="inductor".
     Inductor generates (fused) Triton kernels for disabled custom ops."""
     splitting_ops: list[str] | None = None
     """A list of ops to exclude from cudagraphs, used in piecewise compilation.
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
index 66250f816f459..371b691759348 100644
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -86,9 +86,12 @@ class CustomOp(nn.Module):
         # specific backend. Currently, we do not support dynamic dispatching.
         compilation_config = get_cached_compilation_config()
 
-        # CustomOp object can be enforce enabled, e.g., enable device-specific
-        # kernels in ViT models when enabling graph mode. By default, it will
-        # follow the compilation_config to determine whether enable itself.
+        # NOTE(shen-shanshan): A CustomOp object can be force-enabled, e.g.,
+        # to use device-specific kernels in ViT models when graph mode is
+        # enabled. By default, it follows the compilation_config to determine
+        # whether to enable itself.
+        # This enforce_enable mechanism will be removed once a separate
+        # compilation_config is added for the multi-modal part.
         enabled = self._enforce_enable or self.enabled()
         if enabled:
             compilation_config.enabled_custom_ops.update([self.__class__.name])

From e391c275a18a9063ccb0c9720fde4de7c360e17e Mon Sep 17 00:00:00 2001
From: shen-shanshan <467638484@qq.com>
Date: Tue, 23 Dec 2025 09:44:50 +0000
Subject: [PATCH 2/6] fix lint

Signed-off-by: shen-shanshan <467638484@qq.com>
---
 docs/design/custom_op.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md
index fee7f89171b23..60716d4f88246 100644
--- a/docs/design/custom_op.md
+++ b/docs/design/custom_op.md
@@ -29,7 +29,7 @@ When a `CustomOp` is called (i.e., call its `forward()` method), if it is enable
 - **OOT platform:** dispatch to `forward_oot()`. This will only be called on OOT platforms.
 - **Default:** dispatch to `forward_native()` as a final fallback for all platforms.
 
-Furthur more, vLLM decides whether enable or disable a `CustomOp` by `compilation_config.custom_ops`. To be specific, if a `CustomOp` is not registered (i.e., use default config), it will be enabled if there is a `all` in `compilation_config.custom_ops` or will be disabled if there is a `none`. 
+Furthur more, vLLM decides whether enable or disable a `CustomOp` by `compilation_config.custom_ops`. To be specific, if a `CustomOp` is not registered (i.e., use default config), it will be enabled if there is a `all` in `compilation_config.custom_ops` or will be disabled if there is a `none`.
 
 !!! note
     Note that `all` and `none` cannot coexist in `compilation_config.custom_ops`.
@@ -38,7 +38,7 @@ By default, if `compilation_config.backend == "inductor"` and `compilation_confi
 
 !!! note
     For multi-modal models, vLLM has enforece enabled some custom ops to use device-specific deep-optimized kernels for better performance in ViT part, such as `MMEncoderAttention` and `ApplyRotaryEmb`. We can also pass a `enforce_enable=True` param to the `__init__()` method of the `CustomOp` to enforce enable itself at object-level.
-    
+
     Note that this `enforce_enable` mechanism will be removed after we adding a separate `compilation_config` for multi-modal part.
 
 ## How to Customise Your Configuration for CustomOp

From e1c9d6e7e09903a09359f7f670d5973fb924d4ac Mon Sep 17 00:00:00 2001
From: shen-shanshan <467638484@qq.com>
Date: Wed, 24 Dec 2025 02:06:03 +0000
Subject: [PATCH 3/6] fix

Signed-off-by: shen-shanshan <467638484@qq.com>
---
 docs/design/custom_op.md   | 5 ++++-
 vllm/config/compilation.py | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md
index 60716d4f88246..64822412c93fa 100644
--- a/docs/design/custom_op.md
+++ b/docs/design/custom_op.md
@@ -29,12 +29,15 @@ When a `CustomOp` is called (i.e., call its `forward()` method), if it is enable
 - **OOT platform:** dispatch to `forward_oot()`. This will only be called on OOT platforms.
 - **Default:** dispatch to `forward_native()` as a final fallback for all platforms.
 
+!!! note
+    Note that the dispatching logic might not be absolute because of class inheritance. A derived class might override the behavior.
+
 Furthur more, vLLM decides whether enable or disable a `CustomOp` by `compilation_config.custom_ops`. To be specific, if a `CustomOp` is not registered (i.e., use default config), it will be enabled if there is a `all` in `compilation_config.custom_ops` or will be disabled if there is a `none`.
 
 !!! note
     Note that `all` and `none` cannot coexist in `compilation_config.custom_ops`.
 
-By default, if `compilation_config.backend == "inductor"` and `compilation_config.mode != CompilationMode.NONE`, a `none` will be appended into `compilation_config.custom_ops`, otherwise a `all` will be appended. In other words, this means `CustomOp` will be disabled in some platforms (i.e., those use `inductor` as dafault backend for `torch.compile`) when running with graph mode. In this case, Inductor generates (fused) Triton kernels for those disabled custom ops.
+By default, if `compilation_config.backend == "inductor"` and `compilation_config.mode != CompilationMode.NONE`, a `none` will be appended into `compilation_config.custom_ops`, otherwise an `all` will be appended. In other words, this means `CustomOp` will be disabled on some platforms (i.e., those that use `inductor` as the default backend for `torch.compile`) when running in torch compile mode. In this case, Inductor generates (fused) Triton kernels for those disabled custom ops.
 
 !!! note
     For multi-modal models, vLLM has enforece enabled some custom ops to use device-specific deep-optimized kernels for better performance in ViT part, such as `MMEncoderAttention` and `ApplyRotaryEmb`. We can also pass a `enforce_enable=True` param to the `__init__()` method of the `CustomOp` to enforce enable itself at object-level.
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 56e69541e6b81..7e3beaa40e5a0 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -404,7 +404,7 @@ class CompilationConfig:
     - 'none,+op1,+op2' to enable only op1 and op2
 
     By default, all custom ops are enabled when running without Inductor and
-    disabled when running with Inductor: mode>=CompilationMode.NONE and
+    disabled when running with Inductor: mode>CompilationMode.NONE and
     backend="inductor".
     Inductor generates (fused) Triton kernels for disabled custom ops."""
     splitting_ops: list[str] | None = None

From a2c7852e4c3577ddc0947d50ba3551c4f3d2a302 Mon Sep 17 00:00:00 2001
From: shen-shanshan <467638484@qq.com>
Date: Wed, 24 Dec 2025 03:21:23 +0000
Subject: [PATCH 4/6] use snippets

Signed-off-by: shen-shanshan <467638484@qq.com>
---
 docs/design/custom_op.md                      | 110 +++++++++++-------
 vllm/attention/layers/mm_encoder_attention.py |   3 +
 vllm/model_executor/layers/activation.py      |  33 ++++++
 vllm/model_executor/layers/conv.py            |   6 +
 .../layers/fused_moe/fused_moe.py             |   3 +
 .../fused_moe/fused_moe_modular_method.py     |   3 +
 vllm/model_executor/layers/fused_moe/layer.py |   3 +
 .../fused_moe/unquantized_fused_moe_method.py |   3 +
 vllm/model_executor/layers/layernorm.py       |   9 ++
 vllm/model_executor/layers/linear.py          |   9 ++
 .../model_executor/layers/logits_processor.py |   3 +
 .../layers/mamba/mamba_mixer.py               |   3 +
 .../layers/mamba/mamba_mixer2.py              |   6 +
 .../model_executor/layers/mamba/short_conv.py |   3 +
 vllm/model_executor/layers/mla.py             |   3 +
 .../layers/quantization/input_quant_fp8.py    |   3 +
 .../layers/rotary_embedding/base.py           |   3 +
 .../layers/rotary_embedding/common.py         |   3 +
 .../rotary_embedding/dual_chunk_rope.py       |   3 +
 .../layers/vocab_parallel_embedding.py        |   6 +
 vllm/model_executor/models/plamo2.py          |   5 +-
 .../model_executor/models/transformers/moe.py |   3 +
 22 files changed, 185 insertions(+), 41 deletions(-)

diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md
index 64822412c93fa..bf386b94e92c6 100644
--- a/docs/design/custom_op.md
+++ b/docs/design/custom_op.md
@@ -57,46 +57,76 @@ For example:
 
 ## Types of Supported CustomOp in vLLM
 
-| Category | OP Name | OP Class |
-|----------|---------|----------|
-| Attention | `mm_encoder_attn` | `MMEncoderAttention` |
-| Attention | `multi_head_latent_attention` | `MultiHeadLatentAttentionWrapper` |
-| Activation | `fatrelu_and_mul` | `FatreluAndMul` |
-| Activation | `silu_and_mul` | `SiluAndMul` |
-| Activation | `mul_and_silu` | `MulAndSilu` |
-| Activation | `gelu_and_mul_sparse` | `GeluAndMulSparse` |
-| Activation | `gelu_and_mul` | `GeluAndMul` |
-| Activation | `swigluoai_and_mul` | `SwigluOAIAndMul` |
-| Activation | `gelu_new` | `NewGELU` |
-| Activation | `gelu_fast` | `FastGELU` |
-| Activation | `quick_gelu` | `QuickGELU` |
-| Activation | `relu2` | `ReLUSquaredActivation` |
-| Activation | `xielu` | `XIELU` |
-| Conv | `conv2d` | `Conv2dLayer` |
-| Conv | `conv3d` | `Conv3dLayer` |
-| Conv | `short_conv` | `ShortConv` |
-| Embedding | `vocab_parallel_embedding` | `VocabParallelEmbedding` |
-| Embedding | `parallel_lm_head` | `ParallelLMHead` |
-| Linear | `row_parallel_linear` | `RowParallelLinear` |
-| Linear | `column_parallel_linear` | `ColumnParallelLinear` |
-| Linear | `replicated_linear` | `ReplicatedLinear` |
-| Logits Processor | `logits_processor` | `LogitsProcessor` |
-| Mamba | `mamba_mixer` | `MambaMixer` |
-| Mamba | `mamba_mixer2` | `MambaMixer2` |
-| Mamba | `plamo2_mamba_mixer` | `Plamo2MambaMixer` |
-| Mamba | `mixer2_gated_rms_norm` | `Mixer2RMSNormGated` |
-| MoE | `fused_moe` | `FusedMoE` |
-| MoE | `modular_fused_moe` | `FusedMoEModularMethod` |
-| MoE | `unquantized_fused_moe` | `UnquantizedFusedMoEMethod` |
-| MoE | `transformers_fused_moe` | `TransformersFusedMoE` |
-| MoE | `grouped_topk` | `GroupedTopk` |
-| Norm | `rms_norm` | `RMSNorm` |
-| Norm | `gemma_rms_norm` | `GemmaRMSNorm` |
-| Norm | `rms_norm_gated` | `RMSNormGated` |
-| Quantization | `quant_fp8` | `QuantFP8` |
-| Rope | `rotary_embedding` | `RotaryEmbeddingBase` |
-| Rope | `dual_chunk_rotary_embedding` | `DualChunkRotaryEmbedding` |
-| Rope | `apply_rotary_emb` | `ApplyRotaryEmb` |
+**1. Attention:**
+
+--8<-- "../../vllm/attention/layers/mm_encoder_attention.py:mm_encoder_attn"
+--8<-- "../../vllm/model_executor/layers/mla.py:multi_head_latent_attention"
+
+**2. Activation:**
+
+--8<-- "../../vllm/model_executor/layers/activation.py:silu_and_mul"
+--8<-- "../../vllm/model_executor/layers/activation.py:mul_and_silu"
+--8<-- "../../vllm/model_executor/layers/activation.py:gelu_new"
+--8<-- "../../vllm/model_executor/layers/activation.py:gelu_fast"
+--8<-- "../../vllm/model_executor/layers/activation.py:quick_gelu"
+--8<-- "../../vllm/model_executor/layers/activation.py:gelu_and_mul"
+--8<-- "../../vllm/model_executor/layers/activation.py:gelu_and_mul_sparse"
+--8<-- "../../vllm/model_executor/layers/activation.py:relu2"
+--8<-- "../../vllm/model_executor/layers/activation.py:xielu"
+--8<-- "../../vllm/model_executor/layers/activation.py:swigluoai_and_mul"
+--8<-- "../../vllm/model_executor/layers/activation.py:fatrelu_and_mul"
+
+**3. MM-Conv:**
+
+--8<-- "../../vllm/model_executor/layers/conv.py:conv2d"
+--8<-- "../../vllm/model_executor/layers/conv.py:conv3d"
+
+**4. Embedding:**
+
+--8<-- "../../vllm/model_executor/layers/vocab_parallel_embedding.py:vocab_parallel_embedding"
+--8<-- "../../vllm/model_executor/layers/vocab_parallel_embedding.py:parallel_lm_head"
+
+**5. Linear:**
+
+--8<-- "../../vllm/model_executor/layers/linear.py:row_parallel_linear"
+--8<-- "../../vllm/model_executor/layers/linear.py:column_parallel_linear"
+--8<-- "../../vllm/model_executor/layers/linear.py:replicated_linear"
+
+**6. Logits Processor:**
+
+--8<-- "../../vllm/model_executor/layers/logits_processor.py:logits_processor"
+
+**7. Mamba:**
+
+--8<-- "../../vllm/model_executor/layers/mamba/mamba_mixer.py:mamba_mixer"
+--8<-- "../../vllm/model_executor/layers/mamba/mamba_mixer2.py:mamba_mixer2"
+--8<-- "../../vllm/model_executor/layers/mamba/mamba_mixer2.py:mixer2_gated_rms_norm"
+--8<-- "../../vllm/model_executor/models/plamo2.py:plamo2_mamba_mixer"
+--8<-- "../../vllm/model_executor/layers/mamba/short_conv.py:short_conv"
+
+**8. MoE:**
+
+--8<-- "../../vllm/model_executor/layers/fused_moe/layer.py:fused_moe"
+--8<-- "../../vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py:modular_fused_moe"
+--8<-- "../../vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py:unquantized_fused_moe"
+--8<-- "../../vllm/model_executor/models/transformers/moe.py:transformers_fused_moe"
+--8<-- "../../vllm/model_executor/layers/fused_moe/fused_moe.py:grouped_topk"
+
+**9. Norm:**
+
+--8<-- "../../vllm/model_executor/layers/layernorm.py:rms_norm"
+--8<-- "../../vllm/model_executor/layers/layernorm.py:rms_norm_gated"
+--8<-- "../../vllm/model_executor/layers/layernorm.py:gemma_rms_norm"
+
+**10. Quantization:**
+
+--8<-- "../../vllm/model_executor/layers/quantization/input_quant_fp8.py:quant_fp8"
+
+**11. Rope:**
+
+--8<-- "../../vllm/model_executor/layers/rotary_embedding/base.py:rotary_embedding"
+--8<-- "../../vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py:dual_chunk_rotary_embedding"
+--8<-- "../../vllm/model_executor/layers/rotary_embedding/common.py:apply_rotary_emb"
 
 ## Guidelines for Implementing a New CustomOp
 
diff --git a/vllm/attention/layers/mm_encoder_attention.py b/vllm/attention/layers/mm_encoder_attention.py
index 25f54cc867b5a..44c3d646f7c46 100644
--- a/vllm/attention/layers/mm_encoder_attention.py
+++ b/vllm/attention/layers/mm_encoder_attention.py
@@ -18,10 +18,13 @@ from vllm.model_executor.models.vision import get_vit_attn_backend
 logger = init_logger(__name__)
 
 
+# --8<-- [start:mm_encoder_attn]
 @CustomOp.register("mm_encoder_attn")
 class MMEncoderAttention(CustomOp):
     """Multi-headed attention without any cache, used for multimodal encoder."""
 
+    # --8<-- [end:mm_encoder_attn]
+
     def __init__(
         self,
         num_heads: int,
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index 7038d0868c7eb..5e904b9070cc1 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -22,6 +22,7 @@ from vllm.utils.collection_utils import LazyDict
 logger = init_logger(__name__)
 
 
+# --8<-- [start:fatrelu_and_mul]
 @CustomOp.register("fatrelu_and_mul")
 class FatreluAndMul(CustomOp):
     """An activation function for FATReLU.
@@ -35,6 +36,8 @@ class FatreluAndMul(CustomOp):
         return: (num_tokens, d) or (batch_size, seq_len, d)
     """
 
+    # --8<-- [end:fatrelu_and_mul]
+
     def __init__(self, threshold: float = 0.0):
         super().__init__()
         self.threshold = threshold
@@ -58,6 +61,7 @@ class FatreluAndMul(CustomOp):
         return out
 
 
+# --8<-- [start:silu_and_mul]
 @CustomOp.register("silu_and_mul")
 class SiluAndMul(CustomOp):
     """An activation function for SwiGLU.
@@ -69,6 +73,8 @@ class SiluAndMul(CustomOp):
         return: (num_tokens, d) or (batch_size, seq_len, d)
     """
 
+    # --8<-- [end:silu_and_mul]
+
     def __init__(self):
         super().__init__()
         if current_platform.is_cuda_alike():
@@ -101,6 +107,7 @@ class SiluAndMul(CustomOp):
         return out
 
 
+# --8<-- [start:mul_and_silu]
 @CustomOp.register("mul_and_silu")
 class MulAndSilu(CustomOp):
     """An activation function for SwiGLU.
@@ -112,6 +119,8 @@ class MulAndSilu(CustomOp):
         return: (num_tokens, d) or (batch_size, seq_len, d)
     """
 
+    # --8<-- [end:mul_and_silu]
+
     def __init__(self):
         super().__init__()
         if current_platform.is_cuda_alike():
@@ -139,6 +148,7 @@ class MulAndSilu(CustomOp):
     # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
 
 
+# --8<-- [start:gelu_and_mul_sparse]
 @CustomOp.register("gelu_and_mul_sparse")
 class GeluAndMulSparse(CustomOp):
     """An activation function for GeluAndMulSparse.
@@ -153,6 +163,8 @@ class GeluAndMulSparse(CustomOp):
         return: (num_tokens, d) or (batch_size, seq_len, d)
     """
 
+    # --8<-- [end:gelu_and_mul_sparse]
+
     def __init__(self, activation_sparsity: float, approximate: str = "none"):
         super().__init__()
         # Gelu.
@@ -195,6 +207,7 @@ class GeluAndMulSparse(CustomOp):
         return self.forward_native(x)
 
 
+# --8<-- [start:gelu_and_mul]
 @CustomOp.register("gelu_and_mul")
 class GeluAndMul(CustomOp):
     """An activation function for GeGLU.
@@ -206,6 +219,8 @@ class GeluAndMul(CustomOp):
         return: (batch_size, seq_len, d) or (num_tokens, d)
     """
 
+    # --8<-- [end:gelu_and_mul]
+
     def __init__(self, approximate: str = "none"):
         super().__init__()
         self.approximate = approximate
@@ -257,9 +272,12 @@ class GeluAndMul(CustomOp):
         return f"approximate={repr(self.approximate)}"
 
 
+# --8<-- [start:swigluoai_and_mul]
 @CustomOp.register("swigluoai_and_mul")
 class SwigluOAIAndMul(CustomOp):
     # https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/models/gpt_oss/modeling_gpt_oss.py#L106-L110
+    # --8<-- [end:swigluoai_and_mul]
+
     def __init__(self, alpha: float = 1.702, limit: float = 7.0):
         super().__init__()
         self.alpha = alpha
@@ -286,8 +304,11 @@ class SwigluOAIAndMul(CustomOp):
         return f"alpha={repr(self.alpha)}, limit={repr(self.limit)}"
 
 
+# --8<-- [start:gelu_new]
 @CustomOp.register("gelu_new")
 class NewGELU(CustomOp):
+    # --8<-- [end:gelu_new]
+
     def __init__(self):
         super().__init__()
         if current_platform.is_cuda_alike() or current_platform.is_cpu():
@@ -311,8 +332,11 @@ class NewGELU(CustomOp):
         return self.op(x)
 
 
+# --8<-- [start:gelu_fast]
 @CustomOp.register("gelu_fast")
 class FastGELU(CustomOp):
+    # --8<-- [end:gelu_fast]
+
     def __init__(self):
         super().__init__()
         if current_platform.is_cuda_alike() or current_platform.is_cpu():
@@ -335,9 +359,12 @@ class FastGELU(CustomOp):
         return self.op(x)
 
 
+# --8<-- [start:quick_gelu]
 @CustomOp.register("quick_gelu")
 class QuickGELU(CustomOp):
     # https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py#L90
+    # --8<-- [end:quick_gelu]
+
     def __init__(self):
         super().__init__()
         if current_platform.is_cuda_alike() or current_platform.is_cpu():
@@ -365,12 +392,15 @@ class QuickGELU(CustomOp):
     # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
 
 
+# --8<-- [start:relu2]
 @CustomOp.register("relu2")
 class ReLUSquaredActivation(CustomOp):
     """
     Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2
     """
 
+    # --8<-- [end:relu2]
+
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
         return torch.square(F.relu(x))
@@ -380,6 +410,7 @@ class ReLUSquaredActivation(CustomOp):
         return self.forward_native(x)
 
 
+# --8<-- [start:xielu]
 @CustomOp.register("xielu")
 class XIELU(CustomOp):
     """
@@ -388,6 +419,8 @@ class XIELU(CustomOp):
     Otherwise, we emit a single warning and use xIELU Python
     """
 
+    # --8<-- [end:xielu]
+
     def __init__(
         self,
         alpha_p_init: float = 0.8,
diff --git a/vllm/model_executor/layers/conv.py b/vllm/model_executor/layers/conv.py
index 1cd02698b3863..f4709f2f4d80f 100644
--- a/vllm/model_executor/layers/conv.py
+++ b/vllm/model_executor/layers/conv.py
@@ -105,10 +105,13 @@ class ConvLayerBase(CustomOp):
         return s
 
 
+# --8<-- [start:conv2d]
 @CustomOp.register("conv2d")
 class Conv2dLayer(ConvLayerBase):
     """Conv layer with Conv2d."""
 
+    # --8<-- [end:conv2d]
+
     num_dim = 2
 
     def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor:
@@ -204,10 +207,13 @@ class CausalConv2dLayer(Conv2dLayer):
         return x
 
 
+# --8<-- [start:conv3d]
 @CustomOp.register("conv3d")
 class Conv3dLayer(ConvLayerBase):
     """Conv layer with Conv3d."""
 
+    # --8<-- [end:conv3d]
+
     num_dim = 3
 
     def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor:
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index bf51554341607..fae093b6c72f3 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1283,10 +1283,13 @@ def grouped_topk(
     return topk_weights.to(torch.float32), topk_ids.to(torch.int32)
 
 
+# --8<-- [start:grouped_topk]
 @CustomOp.register("grouped_topk")
 class GroupedTopk(CustomOp):
     """GroupedTopk used by the Deepseek-V2 and Deepseek-V3 model."""
 
+    # --8<-- [end:grouped_topk]
+
     def __init__(
         self,
         topk: int,
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
index 30ff1bf2f008a..c930a63a2c1ac 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py
@@ -20,8 +20,11 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
 logger = init_logger(__name__)
 
 
+# --8<-- [start:modular_fused_moe]
 @CustomOp.register("modular_fused_moe")
 class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
+    # --8<-- [end:modular_fused_moe]
+
     def __init__(
         self, old_quant_method: FusedMoEMethodBase, experts: FusedMoEModularKernel
     ):
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 2e7267d56d838..f54d942683d02 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -297,6 +297,7 @@ def maybe_roundup_hidden_size(
     return hidden_size
 
 
+# --8<-- [start:fused_moe]
 @CustomOp.register("fused_moe")
 class FusedMoE(CustomOp):
     """FusedMoE layer for MoE models.
@@ -320,6 +321,8 @@ class FusedMoE(CustomOp):
         enable_eplb: Whether to enable expert parallelism load balancer.
     """
 
+    # --8<-- [end:fused_moe]
+
     def __init__(
         self,
         num_experts: int,  # Global number of experts
diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
index 82dbccf3fa9da..85a7782fbf497 100644
--- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
+++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
@@ -46,10 +46,13 @@ else:
 logger = init_logger(__name__)
 
 
+# --8<-- [start:unquantized_fused_moe]
 @CustomOp.register("unquantized_fused_moe")
 class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
     """MoE method without quantization."""
 
+    # --8<-- [end:unquantized_fused_moe]
+
     def __init__(self, moe: FusedMoEConfig):
         super().__init__(moe)
 
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index 8cc374ac9155d..d962ab8bbc22d 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -88,6 +88,7 @@ def dispatch_rocm_rmsnorm_func(
     return rms_norm
 
 
+# --8<-- [start:rms_norm]
 @CustomOp.register("rms_norm")
 class RMSNorm(CustomOp):
     """Root mean square normalization.
@@ -96,6 +97,8 @@ class RMSNorm(CustomOp):
     Refer to https://arxiv.org/abs/1910.07467
     """
 
+    # --8<-- [end:rms_norm]
+
     def __init__(
         self,
         hidden_size: int,
@@ -253,6 +256,7 @@ class RMSNorm(CustomOp):
         return s
 
 
+# --8<-- [start:gemma_rms_norm]
 @CustomOp.register("gemma_rms_norm")
 class GemmaRMSNorm(CustomOp):
     """RMS normalization for Gemma.
@@ -262,6 +266,8 @@ class GemmaRMSNorm(CustomOp):
         2. (x * w).to(orig_dtype) instead of x.to(orig_dtype) * w.
     """
 
+    # --8<-- [end:gemma_rms_norm]
+
     def __init__(
         self,
         hidden_size: int,
@@ -321,6 +327,7 @@ class GemmaRMSNorm(CustomOp):
         return self.forward_native(x, residual)
 
 
+# --8<-- [start:rms_norm_gated]
 @CustomOp.register("rms_norm_gated")
 class RMSNormGated(CustomOp):
     """RMS Normalization with optional gating.
@@ -331,6 +338,8 @@ class RMSNormGated(CustomOp):
     - Optional gating with SiLU activation
     """
 
+    # --8<-- [end:rms_norm_gated]
+
     def __init__(
         self,
         hidden_size: int,
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 402f0bf69ceaa..e4131a5994bb1 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -296,6 +296,7 @@ class LinearBase(CustomOp):
                 param.tp_size = self.tp_size
 
 
+# --8<-- [start:replicated_linear]
 @CustomOp.register("replicated_linear")
 class ReplicatedLinear(LinearBase):
     """Replicated linear layer.
@@ -313,6 +314,8 @@ class ReplicatedLinear(LinearBase):
         disable_tp: Take no effect for replicated linear layers.
     """
 
+    # --8<-- [end:replicated_linear]
+
     def __init__(
         self,
         input_size: int,
@@ -413,6 +416,7 @@ class ReplicatedLinear(LinearBase):
         return s
 
 
+# --8<-- [start:column_parallel_linear]
 @CustomOp.register("column_parallel_linear")
 class ColumnParallelLinear(LinearBase):
     """Linear layer with column parallelism.
@@ -440,6 +444,8 @@ class ColumnParallelLinear(LinearBase):
         disable_tp: If true, weights matrix won't be sharded through tp rank.
     """
 
+    # --8<-- [end:column_parallel_linear]
+
     def __init__(
         self,
         input_size: int,
@@ -1276,6 +1282,7 @@ class QKVParallelLinear(ColumnParallelLinear):
         param_data.copy_(loaded_weight)
 
 
+# --8<-- [start:row_parallel_linear]
 @CustomOp.register("row_parallel_linear")
 class RowParallelLinear(LinearBase):
     """Linear layer with row parallelism.
@@ -1310,6 +1317,8 @@ class RowParallelLinear(LinearBase):
         disable_tp: If true, weights matrix won't be sharded through tp rank.
     """
 
+    # --8<-- [end:row_parallel_linear]
+
     def __init__(
         self,
         input_size: int,
diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py
index c8d57f597d1ca..38753b0fcc748 100644
--- a/vllm/model_executor/layers/logits_processor.py
+++ b/vllm/model_executor/layers/logits_processor.py
@@ -13,6 +13,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmb
 from vllm.platforms import current_platform
 
 
+# --8<-- [start:logits_processor]
 @CustomOp.register("logits_processor")
 class LogitsProcessor(CustomOp):
     """Process logits and apply logits processors from sampling metadata.
@@ -23,6 +24,8 @@ class LogitsProcessor(CustomOp):
     3. Apply logits processors (if any).
     """
 
+    # --8<-- [end:logits_processor]
+
     def __init__(
         self,
         vocab_size: int,
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py
index 0b63acf2dc5a5..0fa11cf02eede 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer.py
@@ -39,6 +39,7 @@ from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionMetadata
 
 
 # Adapted from transformers.models.mamba.modeling_mamba.MambaMixer
+# --8<-- [start:mamba_mixer]
 @CustomOp.register("mamba_mixer")
 class MambaMixer(MambaBase, CustomOp):
     """
@@ -51,6 +52,8 @@ class MambaMixer(MambaBase, CustomOp):
     **selective** state spaces)
     """
 
+    # --8<-- [end:mamba_mixer]
+
     def __init__(
         self,
         hidden_size: int,
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py
index 0ea5805305eda..875bc9019fbac 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer2.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py
@@ -49,8 +49,11 @@ from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata
 
 
 # Adapted from transformers.models.mamba2.modeling_mamba2.MambaRMSNormGated
+# --8<-- [start:mixer2_gated_rms_norm]
 @CustomOp.register("mixer2_gated_rms_norm")
 class Mixer2RMSNormGated(CustomOp):
+    # --8<-- [end:mixer2_gated_rms_norm]
+
     def __init__(
         self,
         full_hidden_size: int,
@@ -214,6 +217,7 @@ def mamba_v2_sharded_weight_loader(
 
 
 # Adapted from transformers.models.mamba.modeling_mamba.MambaMixer
+# --8<-- [start:mamba_mixer2]
 @CustomOp.register("mamba_mixer2")
 class MambaMixer2(MambaBase, CustomOp):
     """
@@ -226,6 +230,8 @@ class MambaMixer2(MambaBase, CustomOp):
     **selective** state spaces)
     """
 
+    # --8<-- [end:mamba_mixer2]
+
     def __init__(
         self,
         hidden_size: int,
diff --git a/vllm/model_executor/layers/mamba/short_conv.py b/vllm/model_executor/layers/mamba/short_conv.py
index 0bbad17d7ebc7..af9cd4b6d39b8 100644
--- a/vllm/model_executor/layers/mamba/short_conv.py
+++ b/vllm/model_executor/layers/mamba/short_conv.py
@@ -27,8 +27,11 @@ from vllm.utils.torch_utils import direct_register_custom_op
 from vllm.v1.attention.backends.short_conv_attn import ShortConvAttentionMetadata
 
 
+# --8<-- [start:short_conv]
 @CustomOp.register("short_conv")
 class ShortConv(MambaBase, CustomOp):
+    # --8<-- [end:short_conv]
+
     def __init__(
         self,
         config,
diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py
index 1656f4deb6717..65541d2a485a8 100644
--- a/vllm/model_executor/layers/mla.py
+++ b/vllm/model_executor/layers/mla.py
@@ -29,6 +29,7 @@ class MLAModules:
     indexer_rotary_emb: torch.nn.Module | None = None
 
 
+# --8<-- [start:multi_head_latent_attention]
 @CustomOp.register("multi_head_latent_attention")
 class MultiHeadLatentAttentionWrapper(CustomOp):
     """MLA layer registered as CustomOp to allow OOT backends to add
@@ -47,6 +48,8 @@ class MultiHeadLatentAttentionWrapper(CustomOp):
     3. Return the output tensor.
     """
 
+    # --8<-- [end:multi_head_latent_attention]
+
     def __init__(
         self,
         hidden_size: int,
diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py
index 7994c838ad548..6f43eac14e8c7 100644
--- a/vllm/model_executor/layers/quantization/input_quant_fp8.py
+++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py
@@ -19,6 +19,7 @@ _FP8_MIN = -224.0 if current_platform.is_fp8_fnuz() else _FP8_FINFO.min
 _FP8_MIN_SCALING_FACTOR = 1.0 / (_FP8_MAX * 512.0)
 
 
+# --8<-- [start:quant_fp8]
 @CustomOp.register("quant_fp8")
 class QuantFP8(CustomOp):
     """
@@ -26,6 +27,8 @@ class QuantFP8(CustomOp):
     This CustomOp supports both static and dynamic quantization.
     """
 
+    # --8<-- [end:quant_fp8]
+
     def __init__(
         self,
         static: bool,
diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py
index 7e83ea9a1355b..155d6c5541b50 100644
--- a/vllm/model_executor/layers/rotary_embedding/base.py
+++ b/vllm/model_executor/layers/rotary_embedding/base.py
@@ -10,10 +10,13 @@ from vllm.model_executor.custom_op import CustomOp
 from .common import ApplyRotaryEmb
 
 
+# --8<-- [start:rotary_embedding]
 @CustomOp.register("rotary_embedding")
 class RotaryEmbeddingBase(CustomOp):
     """Original rotary positional embedding."""
 
+    # --8<-- [end:rotary_embedding]
+
     def __init__(
         self,
         head_size: int,
diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py
index b86cd9f001d61..dba19471eb618 100644
--- a/vllm/model_executor/layers/rotary_embedding/common.py
+++ b/vllm/model_executor/layers/rotary_embedding/common.py
@@ -118,8 +118,11 @@ direct_register_custom_op(
 )
 
 
+# --8<-- [start:apply_rotary_emb]
 @CustomOp.register("apply_rotary_emb")
 class ApplyRotaryEmb(CustomOp):
+    # --8<-- [end:apply_rotary_emb]
+
     def __init__(
         self,
         enforce_enable: bool = False,
diff --git a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
index b5dd94cc7f531..e5dabe035b34e 100644
--- a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
+++ b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py
@@ -9,10 +9,13 @@ from vllm.model_executor.custom_op import CustomOp
 from .common import rotate_gptj, rotate_neox
 
 
+# --8<-- [start:dual_chunk_rotary_embedding]
 @CustomOp.register("dual_chunk_rotary_embedding")
 class DualChunkRotaryEmbedding(CustomOp):
     """Rotary positional embedding for Dual Chunk Attention."""
 
+    # --8<-- [end:dual_chunk_rotary_embedding]
+
     def __init__(
         self,
         head_size: int,
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index 1abc3ad884550..daaa86bed4786 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -181,6 +181,7 @@ def get_masked_input_and_mask(
     return input_, ~vocab_mask
 
 
+# --8<-- [start:vocab_parallel_embedding]
 @CustomOp.register("vocab_parallel_embedding")
 class VocabParallelEmbedding(CustomOp):
     """Embedding parallelized in the vocabulary dimension.
@@ -221,6 +222,8 @@ class VocabParallelEmbedding(CustomOp):
         prefix: full name of the layer in the state dict
     """  # noqa: E501
 
+    # --8<-- [end:vocab_parallel_embedding]
+
     def __init__(
         self,
         num_embeddings: int,
@@ -492,6 +495,7 @@ class VocabParallelEmbedding(CustomOp):
         return s
 
 
+# --8<-- [start:parallel_lm_head]
 @CustomOp.register("parallel_lm_head")
 class ParallelLMHead(VocabParallelEmbedding):
     """Parallelized LM head.
@@ -509,6 +513,8 @@ class ParallelLMHead(VocabParallelEmbedding):
         padding_size: padding size for the vocabulary.
     """
 
+    # --8<-- [end:parallel_lm_head]
+
     def __init__(
         self,
         num_embeddings: int,
diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py
index 6765ee0c5779c..b957abd2e8760 100644
--- a/vllm/model_executor/models/plamo2.py
+++ b/vllm/model_executor/models/plamo2.py
@@ -97,8 +97,11 @@ def is_mamba(config: Plamo2Config, i: int) -> bool:
 # Adapted from:
 # vllm.model_executor.layers.mamba.mamba_mixer2.MambaMixer2
 # transformers.models.mamba.modeling_mamba.MambaMixer
-@CustomOp.register(name="plamo2_mamba_mixer")
+# --8<-- [start:plamo2_mamba_mixer]
+@CustomOp.register("plamo2_mamba_mixer")
 class Plamo2MambaMixer(MambaBase, CustomOp):
+    # --8<-- [end:plamo2_mamba_mixer]
+
     def __init__(self, vllm_config: VllmConfig, *, prefix: str = "", **kwargs) -> None:
         super().__init__()
         self.config = vllm_config.model_config.hf_config
diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py
index 31db9d682bd40..46bd475c01323 100644
--- a/vllm/model_executor/models/transformers/moe.py
+++ b/vllm/model_executor/models/transformers/moe.py
@@ -37,10 +37,13 @@ if TYPE_CHECKING:
     from vllm.config import VllmConfig
 
 
+# --8<-- [start:transformers_fused_moe]
 @CustomOp.register("transformers_fused_moe")
 class TransformersFusedMoE(FusedMoE):
     """Custom FusedMoE for the Transformers modeling backend."""
 
+    # --8<-- [end:transformers_fused_moe]
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self._topk_ids: torch.Tensor = None

From 7fdd6a11fae521326bcbf7b836fda3b0f6a45bc9 Mon Sep 17 00:00:00 2001
From: shen-shanshan <467638484@qq.com>
Date: Wed, 24 Dec 2025 06:41:36 +0000
Subject: [PATCH 5/6] docs: fence CustomOp snippet includes and use repo-root paths

Signed-off-by: shen-shanshan <467638484@qq.com>
---
 docs/design/custom_op.md | 98 ++++++++++++++++++++++++----------------
 1 file changed, 60 insertions(+), 38 deletions(-)

diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md
index bf386b94e92c6..0345e42d07e79 100644
--- a/docs/design/custom_op.md
+++ b/docs/design/custom_op.md
@@ -59,74 +59,96 @@ For example:
 
 **1. Attention:**
 
---8<-- "../../vllm/attention/layers/mm_encoder_attention.py:mm_encoder_attn"
---8<-- "../../vllm/model_executor/layers/mla.py:multi_head_latent_attention"
+```python
+--8<-- "vllm/attention/layers/mm_encoder_attention.py:mm_encoder_attn"
+--8<-- "vllm/model_executor/layers/mla.py:multi_head_latent_attention"
+```
 
 **2. Activation:**
 
---8<-- "../../vllm/model_executor/layers/activation.py:silu_and_mul"
---8<-- "../../vllm/model_executor/layers/activation.py:mul_and_silu"
---8<-- "../../vllm/model_executor/layers/activation.py:gelu_new"
---8<-- "../../vllm/model_executor/layers/activation.py:gelu_fast"
---8<-- "../../vllm/model_executor/layers/activation.py:quick_gelu"
---8<-- "../../vllm/model_executor/layers/activation.py:gelu_and_mul"
---8<-- "../../vllm/model_executor/layers/activation.py:gelu_and_mul_sparse"
---8<-- "../../vllm/model_executor/layers/activation.py:relu2"
---8<-- "../../vllm/model_executor/layers/activation.py:xielu"
---8<-- "../../vllm/model_executor/layers/activation.py:swigluoai_and_mul"
---8<-- "../../vllm/model_executor/layers/activation.py:fatrelu_and_mul"
+```python
+--8<-- "vllm/model_executor/layers/activation.py:silu_and_mul"
+--8<-- "vllm/model_executor/layers/activation.py:mul_and_silu"
+--8<-- "vllm/model_executor/layers/activation.py:gelu_new"
+--8<-- "vllm/model_executor/layers/activation.py:gelu_fast"
+--8<-- "vllm/model_executor/layers/activation.py:quick_gelu"
+--8<-- "vllm/model_executor/layers/activation.py:gelu_and_mul"
+--8<-- "vllm/model_executor/layers/activation.py:gelu_and_mul_sparse"
+--8<-- "vllm/model_executor/layers/activation.py:relu2"
+--8<-- "vllm/model_executor/layers/activation.py:xielu"
+--8<-- "vllm/model_executor/layers/activation.py:swigluoai_and_mul"
+--8<-- "vllm/model_executor/layers/activation.py:fatrelu_and_mul"
+```
 
 **3. MM-Conv:**
 
---8<-- "../../vllm/model_executor/layers/conv.py:conv2d"
---8<-- "../../vllm/model_executor/layers/conv.py:conv3d"
+```python
+--8<-- "vllm/model_executor/layers/conv.py:conv2d"
+--8<-- "vllm/model_executor/layers/conv.py:conv3d"
+```
 
 **4. Embedding:**
 
---8<-- "../../vllm/model_executor/layers/vocab_parallel_embedding.py:vocab_parallel_embedding"
---8<-- "../../vllm/model_executor/layers/vocab_parallel_embedding.py:parallel_lm_head"
+```python
+--8<-- "vllm/model_executor/layers/vocab_parallel_embedding.py:vocab_parallel_embedding"
+--8<-- "vllm/model_executor/layers/vocab_parallel_embedding.py:parallel_lm_head"
+```
 
 **5. Linear:**
 
---8<-- "../../vllm/model_executor/layers/linear.py:row_parallel_linear"
---8<-- "../../vllm/model_executor/layers/linear.py:row_parallel_linear:column_parallel_linear"
---8<-- "../../vllm/model_executor/layers/linear.py:row_parallel_linear:replicated_linear"
+```python
+--8<-- "vllm/model_executor/layers/linear.py:row_parallel_linear"
+--8<-- "vllm/model_executor/layers/linear.py:row_parallel_linear:column_parallel_linear"
+--8<-- "vllm/model_executor/layers/linear.py:row_parallel_linear:replicated_linear"
+```
 
 **6. Logits Processor:**
 
---8<-- "../../vllm/model_executor/layers/logits_processor.py:logits_processor"
+```python
+--8<-- "vllm/model_executor/layers/logits_processor.py:logits_processor"
+```
 
 **7. Mamba:**
 
---8<-- "../../vllm/model_executor/layers/mamba/mamba_mixer.py:mamba_mixer"
---8<-- "../../vllm/model_executor/layers/mamba/mamba_mixer2.py:mamba_mixer2"
---8<-- "../../vllm/model_executor/layers/mamba/mamba_mixer2.py:mixer2_gated_rms_norm"
---8<-- "../../vllm/model_executor/models/plamo2.py:plamo2_mamba_mixer"
---8<-- "../../vllm/model_executor/layers/mamba/short_conv.py:short_conv"
+```python
+--8<-- "vllm/model_executor/layers/mamba/mamba_mixer.py:mamba_mixer"
+--8<-- "vllm/model_executor/layers/mamba/mamba_mixer2.py:mamba_mixer2"
+--8<-- "vllm/model_executor/layers/mamba/mamba_mixer2.py:mixer2_gated_rms_norm"
+--8<-- "vllm/model_executor/models/plamo2.py:plamo2_mamba_mixer"
+--8<-- "vllm/model_executor/layers/mamba/short_conv.py:short_conv"
+```
 
 **8. MoE:**
 
---8<-- "../../vllm/model_executor/layers/fused_moe/layer.py:fused_moe"
---8<-- "../../vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py:modular_fused_moe"
---8<-- "../../vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py:unquantized_fused_moe"
---8<-- "../../vllm/model_executor/models/transformers/moe.py:transformers_fused_moe"
---8<-- "../../vllm/model_executor/layers/fused_moe/fused_moe.py:grouped_topk"
+```python
+--8<-- "vllm/model_executor/layers/fused_moe/layer.py:fused_moe"
+--8<-- "vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py:modular_fused_moe"
+--8<-- "vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py:unquantized_fused_moe"
+--8<-- "vllm/model_executor/models/transformers/moe.py:transformers_fused_moe"
+--8<-- "vllm/model_executor/layers/fused_moe/fused_moe.py:grouped_topk"
+```
 
 **9. Norm:**
 
---8<-- "../../vllm/model_executor/layers/layernorm.py:rms_norm"
---8<-- "../../vllm/model_executor/layers/layernorm.py:rms_norm_gated"
---8<-- "../../vllm/model_executor/layers/layernorm.py:gemma_rms_norm"
+```python
+--8<-- "vllm/model_executor/layers/layernorm.py:rms_norm"
+--8<-- "vllm/model_executor/layers/layernorm.py:rms_norm_gated"
+--8<-- "vllm/model_executor/layers/layernorm.py:gemma_rms_norm"
+```
 
 **10. Quantization:**
 
---8<-- "../../vllm/model_executor/layers/quantization/input_quant_fp8.py:quant_fp8"
+```python
+--8<-- "vllm/model_executor/layers/quantization/input_quant_fp8.py:quant_fp8"
+```
 
 **11. Rope:**
 
---8<-- "../../vllm/model_executor/layers/rotary_embedding/base.py:rotary_embedding"
---8<-- "../../vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py:dual_chunk_rotary_embedding"
---8<-- "../../vllm/model_executor/layers/rotary_embedding/common.py:apply_rotary_emb"
+```python
+--8<-- "vllm/model_executor/layers/rotary_embedding/base.py:rotary_embedding"
+--8<-- "vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py:dual_chunk_rotary_embedding"
+--8<-- "vllm/model_executor/layers/rotary_embedding/common.py:apply_rotary_emb"
+```
 
 ## Guidelines for Implementing a New CustomOp
 

From b0846c1c2375a4e40ac171700f89fffd88fbec75 Mon Sep 17 00:00:00 2001
From: shen-shanshan <467638484@qq.com>
Date: Wed, 24 Dec 2025 07:29:48 +0000
Subject: [PATCH 6/6] docs: separate CustomOp snippet includes and fix linear section names

Signed-off-by: shen-shanshan <467638484@qq.com>
---
 docs/design/custom_op.md | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md
index 0345e42d07e79..41df8fec084cd 100644
--- a/docs/design/custom_op.md
+++ b/docs/design/custom_op.md
@@ -61,6 +61,7 @@ For example:
 
 ```python
 --8<-- "vllm/attention/layers/mm_encoder_attention.py:mm_encoder_attn"
+
 --8<-- "vllm/model_executor/layers/mla.py:multi_head_latent_attention"
 ```
 
@@ -68,15 +69,25 @@ For example:
 
 ```python
 --8<-- "vllm/model_executor/layers/activation.py:silu_and_mul"
+
 --8<-- "vllm/model_executor/layers/activation.py:mul_and_silu"
+
 --8<-- "vllm/model_executor/layers/activation.py:gelu_new"
+
 --8<-- "vllm/model_executor/layers/activation.py:gelu_fast"
+
 --8<-- "vllm/model_executor/layers/activation.py:quick_gelu"
+
 --8<-- "vllm/model_executor/layers/activation.py:gelu_and_mul"
+
 --8<-- "vllm/model_executor/layers/activation.py:gelu_and_mul_sparse"
+
 --8<-- "vllm/model_executor/layers/activation.py:relu2"
+
 --8<-- "vllm/model_executor/layers/activation.py:xielu"
+
 --8<-- "vllm/model_executor/layers/activation.py:swigluoai_and_mul"
+
 --8<-- "vllm/model_executor/layers/activation.py:fatrelu_and_mul"
 ```
 
@@ -84,6 +95,7 @@ For example:
 
 ```python
 --8<-- "vllm/model_executor/layers/conv.py:conv2d"
+
 --8<-- "vllm/model_executor/layers/conv.py:conv3d"
 ```
 
@@ -91,6 +103,7 @@ For example:
 
 ```python
 --8<-- "vllm/model_executor/layers/vocab_parallel_embedding.py:vocab_parallel_embedding"
+
 --8<-- "vllm/model_executor/layers/vocab_parallel_embedding.py:parallel_lm_head"
 ```
 
@@ -98,8 +111,10 @@ For example:
 
 ```python
 --8<-- "vllm/model_executor/layers/linear.py:row_parallel_linear"
---8<-- "vllm/model_executor/layers/linear.py:row_parallel_linear:column_parallel_linear"
---8<-- "vllm/model_executor/layers/linear.py:row_parallel_linear:replicated_linear"
+
+--8<-- "vllm/model_executor/layers/linear.py:column_parallel_linear"
+
+--8<-- "vllm/model_executor/layers/linear.py:replicated_linear"
 ```
 
 **6. Logits Processor:**
@@ -112,9 +127,13 @@ For example:
 
 ```python
 --8<-- "vllm/model_executor/layers/mamba/mamba_mixer.py:mamba_mixer"
+
 --8<-- "vllm/model_executor/layers/mamba/mamba_mixer2.py:mamba_mixer2"
+
 --8<-- "vllm/model_executor/layers/mamba/mamba_mixer2.py:mixer2_gated_rms_norm"
+
 --8<-- "vllm/model_executor/models/plamo2.py:plamo2_mamba_mixer"
+
 --8<-- "vllm/model_executor/layers/mamba/short_conv.py:short_conv"
 ```
 
@@ -122,9 +141,13 @@ For example:
 
 ```python
 --8<-- "vllm/model_executor/layers/fused_moe/layer.py:fused_moe"
+
 --8<-- "vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py:modular_fused_moe"
+
 --8<-- "vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py:unquantized_fused_moe"
+
 --8<-- "vllm/model_executor/models/transformers/moe.py:transformers_fused_moe"
+
 --8<-- "vllm/model_executor/layers/fused_moe/fused_moe.py:grouped_topk"
 ```
 
@@ -132,7 +155,9 @@ For example:
 
 ```python
 --8<-- "vllm/model_executor/layers/layernorm.py:rms_norm"
+
 --8<-- "vllm/model_executor/layers/layernorm.py:rms_norm_gated"
+
 --8<-- "vllm/model_executor/layers/layernorm.py:gemma_rms_norm"
 ```
 
@@ -146,7 +171,9 @@ For example:
 
 ```python
 --8<-- "vllm/model_executor/layers/rotary_embedding/base.py:rotary_embedding"
+
 --8<-- "vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py:dual_chunk_rotary_embedding"
+
 --8<-- "vllm/model_executor/layers/rotary_embedding/common.py:apply_rotary_emb"
 ```