From f0d738f0cc460b14981aab5350b86130c6e7c5ac Mon Sep 17 00:00:00 2001 From: shen-shanshan <467638484@qq.com> Date: Tue, 23 Dec 2025 09:07:23 +0000 Subject: [PATCH 1/6] add custom op doc Signed-off-by: shen-shanshan <467638484@qq.com> --- docs/design/custom_op.md | 236 +++++++++++++++++++++++++++++++ vllm/config/compilation.py | 3 +- vllm/model_executor/custom_op.py | 9 +- 3 files changed, 244 insertions(+), 4 deletions(-) create mode 100644 docs/design/custom_op.md diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md new file mode 100644 index 0000000000000..fee7f89171b23 --- /dev/null +++ b/docs/design/custom_op.md @@ -0,0 +1,236 @@ +# CustomOp + +`CustomOp` is an abstract class used for dispatching the forward method of various operations to the appropriate backend. It also offers a mechanism for both vLLM and OOT (Out-Of-Tree) plugins to register their custom operations. + +This document will introduce how CustomOp works in vLLM and how to implement a new `CustomOp`. + +## How CustomOp Works in vLLM + +`CustomOp` manages two dictionaries of all custom ops (i.e., op classes, indexed by registered name) in its class, for vLLM and OOT plugins respectively. + +??? code + + ```python + class CustomOp(nn.Module): + + op_registry: dict[str, type["CustomOp"]] = {} + op_registry_oot: dict[str, type["CustomOp"]] = {} + ``` + +We can use `@CustomOp.register("op_name")` to register an op class to the `CustomOp` system. After this, the `op_name` and its class will be added into the `op_registry` dictionary. In addition, We can also register an OOT op by `@CustomOp.register_oot("op_name")`. We will introduce this mechanism in detail later. + +When a `CustomOp` is called (i.e., call its `forward()` method), if it is enabled, it will automatically dispatch the forward method to the appropriate backend according to `current_platform`. 
Otherwise (i.e., it is disabled), it will only call the `forward_native()` method to use PyTorch-native implementation of this forward method. + +- **CPU platform:** dispatch to `forward_cpu()`. +- **CUDA platform:** dispatch to `forward_cuda()`. +- **ROCm platform:** dispatch to `forward_hip()`. If `forward_hip()` is not implemented, it will use `forward_cuda()` as a fallback. +- **XPU platform:** dispatch to `forward_xpu()`. +- **TPU platform:** dispatch to `forward_tpu()`. +- **OOT platform:** dispatch to `forward_oot()`. This will only be called on OOT platforms. +- **Default:** dispatch to `forward_native()` as a final fallback for all platforms. + +Furthur more, vLLM decides whether enable or disable a `CustomOp` by `compilation_config.custom_ops`. To be specific, if a `CustomOp` is not registered (i.e., use default config), it will be enabled if there is a `all` in `compilation_config.custom_ops` or will be disabled if there is a `none`. + +!!! note + Note that `all` and `none` cannot coexist in `compilation_config.custom_ops`. + +By default, if `compilation_config.backend == "inductor"` and `compilation_config.mode != CompilationMode.NONE`, a `none` will be appended into `compilation_config.custom_ops`, otherwise a `all` will be appended. In other words, this means `CustomOp` will be disabled in some platforms (i.e., those use `inductor` as dafault backend for `torch.compile`) when running with graph mode. In this case, Inductor generates (fused) Triton kernels for those disabled custom ops. + +!!! note + For multi-modal models, vLLM has enforece enabled some custom ops to use device-specific deep-optimized kernels for better performance in ViT part, such as `MMEncoderAttention` and `ApplyRotaryEmb`. We can also pass a `enforce_enable=True` param to the `__init__()` method of the `CustomOp` to enforce enable itself at object-level. 
+ + Note that this `enforce_enable` mechanism will be removed after we adding a separate `compilation_config` for multi-modal part. + +## How to Customise Your Configuration for CustomOp + +vLLM also offers fine-grained control over which custom ops to enable or disable for users, by manually passing a `--compilation_config.custom_ops '["..."]'` when launching a server. + +For example: + +- Use `--compilation_config.custom_ops '["all"]'` to enable all custom ops. +- Use `--compilation_config.custom_ops '["none"]'` to disable all custom ops. +- Use `--compilation_config.custom_ops '["all", "-op1"]'` to enable all custom ops except op1 (i.e., a `-` prefix means "disable"). +- Use `--compilation_config.custom_ops '["none", "+op1", "+op2"]'` to only enable op1 and op2 (i.e., a `+` prefix means "enable"). + +## Types of Supported CustomOp in vLLM + +| Category | OP Name | OP Class | +|----------|---------|----------| +| Attention | `mm_encoder_attn` | `MMEncoderAttention` | +| Attention | `multi_head_latent_attention` | `MultiHeadLatentAttentionWrapper` | +| Activation | `fatrelu_and_mul` | `FatreluAndMul` | +| Activation | `silu_and_mul` | `SiluAndMul` | +| Activation | `mul_and_silu` | `MulAndSilu` | +| Activation | `gelu_and_mul_sparse` | `GeluAndMulSparse` | +| Activation | `gelu_and_mul` | `GeluAndMul` | +| Activation | `swigluoai_and_mul` | `SwigluOAIAndMul` | +| Activation | `gelu_new` | `NewGELU` | +| Activation | `gelu_fast` | `FastGELU` | +| Activation | `quick_gelu` | `QuickGELU` | +| Activation | `relu2` | `ReLUSquaredActivation` | +| Activation | `xielu` | `XIELU` | +| Conv | `conv2d` | `Conv2dLayer` | +| Conv | `conv3d` | `Conv3dLayer` | +| Conv | `short_conv` | `ShortConv` | +| Embedding | `vocab_parallel_embedding` | `VocabParallelEmbedding` | +| Embedding | `parallel_lm_head` | `ParallelLMHead` | +| Linear | `row_parallel_linear` | `RowParallelLinear` | +| Linear | `column_parallel_linear` | `ColumnParallelLinear` | +| Linear | `replicated_linear` 
| `ReplicatedLinear` | +| Logits Processor | `logits_processor` | `LogitsProcessor` | +| Mamba | `mamba_mixer` | `MambaMixer` | +| Mamba | `mamba_mixer2` | `MambaMixer2` | +| Mamba | `plamo2_mamba_mixer` | `Plamo2MambaMixer` | +| Mamba | `mixer2_gated_rms_norm` | `Mixer2RMSNormGated` | +| MoE | `fused_moe` | `FusedMoE` | +| MoE | `modular_fused_moe` | `FusedMoEModularMethod` | +| MoE | `unquantized_fused_moe` | `UnquantizedFusedMoEMethod` | +| MoE | `transformers_fused_moe` | `TransformersFusedMoE` | +| MoE | `grouped_topk` | `GroupedTopk` | +| Norm | `rms_norm` | `RMSNorm` | +| Norm | `gemma_rms_norm` | `GemmaRMSNorm` | +| Norm | `rms_norm_gated` | `RMSNormGated` | +| Quantization | `quant_fp8` | `QuantFP8` | +| Rope | `rotary_embedding` | `RotaryEmbeddingBase` | +| Rope | `dual_chunk_rotary_embedding` | `DualChunkRotaryEmbedding` | +| Rope | `apply_rotary_emb` | `ApplyRotaryEmb` | + +## Guidelines for Implementing a New CustomOp + +### Implement a New CustomOp in vLLM + +This part is a tutorial of how to implement a New `CustomOp` in vLLM. + +Steps: + +1. Implement a new op class, which extends from `CustomOp` base class. +2. Add the `@CustomOp.register("op_name")` decorator on this op class to register it into `CustomOp` system. +3. Implement different `forward_xxx()` method according to your needs. + +Taking `MMEncoderAttention` as an example: + +??? code + + ```python + @CustomOp.register("mm_encoder_attn") + class MMEncoderAttention(CustomOp): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float | None = None, + num_kv_heads: int | None = None, + prefix: str = "", + multimodal_config: MultiModalConfig | None = None, + ) -> None: + super().__init__() + # Init... + + def forward_native( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention + ) -> torch.Tensor: + # Call TORCH_SDPA implementation... 
+ + def forward_cuda( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention + ) -> torch.Tensor: + # Call FA or TORCH_SDPA implementation... + + def forward_cpu( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention + ) -> torch.Tensor: + # Call TORCH_SDPA implementation... + + def forward_xpu( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention + ) -> torch.Tensor: + # Call FA implementation... + + def forward_tpu( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention + ) -> torch.Tensor: + # Call PALLAS implementation... + ``` + +### Register a New CustomOp in OOT Device Plugins + +Currently, thanks to [vLLM's hardware-plugin mechanism](./plugin_system.md), a variety of OOT device plugins have emerged that enable vLLM to run seamlessly on different hardware. You can also find more details about this mechanism at [Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html). + +- **Official device plugins:** [vllm-ascend](https://github.com/vllm-project/vllm-ascend) (for Huawei Ascend NPU), [vllm-spyre](https://github.com/vllm-project/vllm-spyre) +(for Spyre), [vllm-gaudi](https://github.com/vllm-project/vllm-gaudi) (for Intel Gaudi), [vllm-neuron](https://github.com/vllm-project/vllm-neuron) (for AWS Neuron), [vllm-metal](https://github.com/vllm-project/vllm-metal) (for Apple Silicon), etc. 
+- **Non-official device plugins:** [vllm-metax](https://github.com/MetaX-MACA/vLLM-metax) (for MetaX GPU), [vllm-kunlun](https://github.com/baidu/vLLM-Kunlun) (for Baidu Kunlun XPU), etc. + +In this case, `CustomOp` can enable these hardware manufacturers to seamlessly replace vLLM's operations with their deep-optimized kernels for specific devices at runtime, by just registering an OOT `CustomOp` and implementing the `forward_oot()` method. + +Now, this part will show you how to register an OOT `CustomOp` for a device plugin. + +Taking `MMEncoderAttention` as an example: + +1. Implement a `CustomMMEncoderAttention` class that extends `MMEncoderAttention` and implements its `forward_oot()` method. +2. Register your `CustomMMEncoderAttention` into vLLM to replace `MMEncoderAttention`. + +??? code + + ```python + from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention + from vllm.model_executor.custom_op import CustomOp + + + @CustomOp.register_oot("MMEncoderAttention") + class CustomMMEncoderAttention(MMEncoderAttention): + + def __init__(...): + super().__init__(...) + + def forward_oot(...): + # Call optimized device-specific kernels. + ... + ``` + +In this case, a new item `{"MMEncoderAttention": CustomMMEncoderAttention}` will be added into `op_registry_oot`. When initializing a `MMEncoderAttention` op object, if the class name (i.e., `MMEncoderAttention`) is contained in the keys of `op_registry_oot`, vLLM will replace it with our registered class (i.e., `CustomMMEncoderAttention`) and instantiate it. + +After that, when this `MMEncoderAttention` op is called, your `forward_oot()` will be called if it is enabled. Thus, you will get the expected performance on your hardware without directly modifying vLLM. + +In addition, you can also register all your `CustomOp` classes in one place for better management. + +??? 
code + + ```python + from vllm.model_executor.custom_op import CustomOp + + + REGISTERED_CUSTOM_OPS = { + "CustomOP1": YourCustomOp1, + "CustomOP2": YourCustomOp2, + "CustomOP3": YourCustomOp3, + } + + for op_name, op_cls in REGISTERED_CUSTOM_OPS.items(): + CustomOp.register_oot(_decorated_op_cls=op_cls, name=op_name) + ``` diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index cd527e4198557..56e69541e6b81 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -404,7 +404,8 @@ class CompilationConfig: - 'none,+op1,+op2' to enable only op1 and op2 By default, all custom ops are enabled when running without Inductor and - disabled when running with Inductor: mode>=VLLM_COMPILE and backend="inductor". + disabled when running with Inductor: mode>=CompilationMode.NONE and + backend="inductor". Inductor generates (fused) Triton kernels for disabled custom ops.""" splitting_ops: list[str] | None = None """A list of ops to exclude from cudagraphs, used in piecewise compilation. diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 66250f816f459..371b691759348 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -86,9 +86,12 @@ class CustomOp(nn.Module): # specific backend. Currently, we do not support dynamic dispatching. compilation_config = get_cached_compilation_config() - # CustomOp object can be enforce enabled, e.g., enable device-specific - # kernels in ViT models when enabling graph mode. By default, it will - # follow the compilation_config to determine whether enable itself. + # NOTE(shen-shanshan): CustomOp object can be enforce enabled, e.g., + # enable device-specific kernels in ViT models when enabling graph + # mode. By default, it will follow the compilation_config to determine + # whether enable itself. + # This enforce_enable mechanism will be removed after we adding a + # separate compilation_config for multi-modal part. 
enabled = self._enforce_enable or self.enabled() if enabled: compilation_config.enabled_custom_ops.update([self.__class__.name]) From e391c275a18a9063ccb0c9720fde4de7c360e17e Mon Sep 17 00:00:00 2001 From: shen-shanshan <467638484@qq.com> Date: Tue, 23 Dec 2025 09:44:50 +0000 Subject: [PATCH 2/6] fix lint Signed-off-by: shen-shanshan <467638484@qq.com> --- docs/design/custom_op.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md index fee7f89171b23..60716d4f88246 100644 --- a/docs/design/custom_op.md +++ b/docs/design/custom_op.md @@ -29,7 +29,7 @@ When a `CustomOp` is called (i.e., call its `forward()` method), if it is enable - **OOT platform:** dispatch to `forward_oot()`. This will only be called on OOT platforms. - **Default:** dispatch to `forward_native()` as a final fallback for all platforms. -Furthur more, vLLM decides whether enable or disable a `CustomOp` by `compilation_config.custom_ops`. To be specific, if a `CustomOp` is not registered (i.e., use default config), it will be enabled if there is a `all` in `compilation_config.custom_ops` or will be disabled if there is a `none`. +Furthur more, vLLM decides whether enable or disable a `CustomOp` by `compilation_config.custom_ops`. To be specific, if a `CustomOp` is not registered (i.e., use default config), it will be enabled if there is a `all` in `compilation_config.custom_ops` or will be disabled if there is a `none`. !!! note Note that `all` and `none` cannot coexist in `compilation_config.custom_ops`. @@ -38,7 +38,7 @@ By default, if `compilation_config.backend == "inductor"` and `compilation_confi !!! note For multi-modal models, vLLM has enforece enabled some custom ops to use device-specific deep-optimized kernels for better performance in ViT part, such as `MMEncoderAttention` and `ApplyRotaryEmb`. 
We can also pass a `enforce_enable=True` param to the `__init__()` method of the `CustomOp` to enforce enable itself at object-level. - + Note that this `enforce_enable` mechanism will be removed after we adding a separate `compilation_config` for multi-modal part. ## How to Customise Your Configuration for CustomOp From e1c9d6e7e09903a09359f7f670d5973fb924d4ac Mon Sep 17 00:00:00 2001 From: shen-shanshan <467638484@qq.com> Date: Wed, 24 Dec 2025 02:06:03 +0000 Subject: [PATCH 3/6] fix Signed-off-by: shen-shanshan <467638484@qq.com> --- docs/design/custom_op.md | 5 ++++- vllm/config/compilation.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md index 60716d4f88246..64822412c93fa 100644 --- a/docs/design/custom_op.md +++ b/docs/design/custom_op.md @@ -29,12 +29,15 @@ When a `CustomOp` is called (i.e., call its `forward()` method), if it is enable - **OOT platform:** dispatch to `forward_oot()`. This will only be called on OOT platforms. - **Default:** dispatch to `forward_native()` as a final fallback for all platforms. +!!! note + Note that the dispatching logic might not be absolute because of class inheritance. Derived class might override the behavior. + Furthur more, vLLM decides whether enable or disable a `CustomOp` by `compilation_config.custom_ops`. To be specific, if a `CustomOp` is not registered (i.e., use default config), it will be enabled if there is a `all` in `compilation_config.custom_ops` or will be disabled if there is a `none`. !!! note Note that `all` and `none` cannot coexist in `compilation_config.custom_ops`. -By default, if `compilation_config.backend == "inductor"` and `compilation_config.mode != CompilationMode.NONE`, a `none` will be appended into `compilation_config.custom_ops`, otherwise a `all` will be appended. 
In other words, this means `CustomOp` will be disabled in some platforms (i.e., those use `inductor` as dafault backend for `torch.compile`) when running with graph mode. In this case, Inductor generates (fused) Triton kernels for those disabled custom ops. +By default, if `compilation_config.backend == "inductor"` and `compilation_config.mode != CompilationMode.NONE`, a `none` will be appended into `compilation_config.custom_ops`, otherwise an `all` will be appended. In other words, this means `CustomOp` will be disabled on some platforms (i.e., those that use `inductor` as the default backend for `torch.compile`) when running in torch compile mode. In this case, Inductor generates (fused) Triton kernels for those disabled custom ops. !!! note For multi-modal models, vLLM has enforece enabled some custom ops to use device-specific deep-optimized kernels for better performance in ViT part, such as `MMEncoderAttention` and `ApplyRotaryEmb`. We can also pass a `enforce_enable=True` param to the `__init__()` method of the `CustomOp` to enforce enable itself at object-level. diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 56e69541e6b81..7e3beaa40e5a0 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -404,7 +404,7 @@ class CompilationConfig: - 'none,+op1,+op2' to enable only op1 and op2 By default, all custom ops are enabled when running without Inductor and - disabled when running with Inductor: mode>=CompilationMode.NONE and + disabled when running with Inductor: mode>CompilationMode.NONE and backend="inductor". 
Inductor generates (fused) Triton kernels for disabled custom ops.""" splitting_ops: list[str] | None = None From a2c7852e4c3577ddc0947d50ba3551c4f3d2a302 Mon Sep 17 00:00:00 2001 From: shen-shanshan <467638484@qq.com> Date: Wed, 24 Dec 2025 03:21:23 +0000 Subject: [PATCH 4/6] use snippest Signed-off-by: shen-shanshan <467638484@qq.com> --- docs/design/custom_op.md | 110 +++++++++++------- vllm/attention/layers/mm_encoder_attention.py | 3 + vllm/model_executor/layers/activation.py | 33 ++++++ vllm/model_executor/layers/conv.py | 6 + .../layers/fused_moe/fused_moe.py | 3 + .../fused_moe/fused_moe_modular_method.py | 3 + vllm/model_executor/layers/fused_moe/layer.py | 3 + .../fused_moe/unquantized_fused_moe_method.py | 3 + vllm/model_executor/layers/layernorm.py | 9 ++ vllm/model_executor/layers/linear.py | 9 ++ .../model_executor/layers/logits_processor.py | 3 + .../layers/mamba/mamba_mixer.py | 3 + .../layers/mamba/mamba_mixer2.py | 6 + .../model_executor/layers/mamba/short_conv.py | 3 + vllm/model_executor/layers/mla.py | 3 + .../layers/quantization/input_quant_fp8.py | 3 + .../layers/rotary_embedding/base.py | 3 + .../layers/rotary_embedding/common.py | 3 + .../rotary_embedding/dual_chunk_rope.py | 3 + .../layers/vocab_parallel_embedding.py | 6 + vllm/model_executor/models/plamo2.py | 5 +- .../model_executor/models/transformers/moe.py | 3 + 22 files changed, 185 insertions(+), 41 deletions(-) diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md index 64822412c93fa..bf386b94e92c6 100644 --- a/docs/design/custom_op.md +++ b/docs/design/custom_op.md @@ -57,46 +57,76 @@ For example: ## Types of Supported CustomOp in vLLM -| Category | OP Name | OP Class | -|----------|---------|----------| -| Attention | `mm_encoder_attn` | `MMEncoderAttention` | -| Attention | `multi_head_latent_attention` | `MultiHeadLatentAttentionWrapper` | -| Activation | `fatrelu_and_mul` | `FatreluAndMul` | -| Activation | `silu_and_mul` | `SiluAndMul` | -| Activation | 
`mul_and_silu` | `MulAndSilu` | -| Activation | `gelu_and_mul_sparse` | `GeluAndMulSparse` | -| Activation | `gelu_and_mul` | `GeluAndMul` | -| Activation | `swigluoai_and_mul` | `SwigluOAIAndMul` | -| Activation | `gelu_new` | `NewGELU` | -| Activation | `gelu_fast` | `FastGELU` | -| Activation | `quick_gelu` | `QuickGELU` | -| Activation | `relu2` | `ReLUSquaredActivation` | -| Activation | `xielu` | `XIELU` | -| Conv | `conv2d` | `Conv2dLayer` | -| Conv | `conv3d` | `Conv3dLayer` | -| Conv | `short_conv` | `ShortConv` | -| Embedding | `vocab_parallel_embedding` | `VocabParallelEmbedding` | -| Embedding | `parallel_lm_head` | `ParallelLMHead` | -| Linear | `row_parallel_linear` | `RowParallelLinear` | -| Linear | `column_parallel_linear` | `ColumnParallelLinear` | -| Linear | `replicated_linear` | `ReplicatedLinear` | -| Logits Processor | `logits_processor` | `LogitsProcessor` | -| Mamba | `mamba_mixer` | `MambaMixer` | -| Mamba | `mamba_mixer2` | `MambaMixer2` | -| Mamba | `plamo2_mamba_mixer` | `Plamo2MambaMixer` | -| Mamba | `mixer2_gated_rms_norm` | `Mixer2RMSNormGated` | -| MoE | `fused_moe` | `FusedMoE` | -| MoE | `modular_fused_moe` | `FusedMoEModularMethod` | -| MoE | `unquantized_fused_moe` | `UnquantizedFusedMoEMethod` | -| MoE | `transformers_fused_moe` | `TransformersFusedMoE` | -| MoE | `grouped_topk` | `GroupedTopk` | -| Norm | `rms_norm` | `RMSNorm` | -| Norm | `gemma_rms_norm` | `GemmaRMSNorm` | -| Norm | `rms_norm_gated` | `RMSNormGated` | -| Quantization | `quant_fp8` | `QuantFP8` | -| Rope | `rotary_embedding` | `RotaryEmbeddingBase` | -| Rope | `dual_chunk_rotary_embedding` | `DualChunkRotaryEmbedding` | -| Rope | `apply_rotary_emb` | `ApplyRotaryEmb` | +**1. Attention:** + +--8<-- "../../vllm/attention/layers/mm_encoder_attention.py:mm_encoder_attn" +--8<-- "../../vllm/model_executor/layers/mla.py:multi_head_latent_attention" + +**2. 
Activation:** + +--8<-- "../../vllm/model_executor/layers/activation.py:silu_and_mul" +--8<-- "../../vllm/model_executor/layers/activation.py:mul_and_silu" +--8<-- "../../vllm/model_executor/layers/activation.py:gelu_new" +--8<-- "../../vllm/model_executor/layers/activation.py:gelu_fast" +--8<-- "../../vllm/model_executor/layers/activation.py:quick_gelu" +--8<-- "../../vllm/model_executor/layers/activation.py:gelu_and_mul" +--8<-- "../../vllm/model_executor/layers/activation.py:gelu_and_mul_sparse" +--8<-- "../../vllm/model_executor/layers/activation.py:relu2" +--8<-- "../../vllm/model_executor/layers/activation.py:xielu" +--8<-- "../../vllm/model_executor/layers/activation.py:swigluoai_and_mul" +--8<-- "../../vllm/model_executor/layers/activation.py:fatrelu_and_mul" + +**3. MM-Conv:** + +--8<-- "../../vllm/model_executor/layers/conv.py:conv2d" +--8<-- "../../vllm/model_executor/layers/conv.py:conv3d" + +**4. Embedding:** + +--8<-- "../../vllm/model_executor/layers/vocab_parallel_embedding.py:vocab_parallel_embedding" +--8<-- "../../vllm/model_executor/layers/vocab_parallel_embedding.py:parallel_lm_head" + +**5. Linear:** + +--8<-- "../../vllm/model_executor/layers/linear.py:row_parallel_linear" +--8<-- "../../vllm/model_executor/layers/linear.py:column_parallel_linear" +--8<-- "../../vllm/model_executor/layers/linear.py:replicated_linear" + +**6. Logits Processor:** + +--8<-- "../../vllm/model_executor/layers/logits_processor.py:logits_processor" + +**7. Mamba:** + +--8<-- "../../vllm/model_executor/layers/mamba/mamba_mixer.py:mamba_mixer" +--8<-- "../../vllm/model_executor/layers/mamba/mamba_mixer2.py:mamba_mixer2" +--8<-- "../../vllm/model_executor/layers/mamba/mamba_mixer2.py:mixer2_gated_rms_norm" +--8<-- "../../vllm/model_executor/models/plamo2.py:plamo2_mamba_mixer" +--8<-- "../../vllm/model_executor/layers/mamba/short_conv.py:short_conv" + +**8. 
MoE:** + +--8<-- "../../vllm/model_executor/layers/fused_moe/layer.py:fused_moe" +--8<-- "../../vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py:modular_fused_moe" +--8<-- "../../vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py:unquantized_fused_moe" +--8<-- "../../vllm/model_executor/models/transformers/moe.py:transformers_fused_moe" +--8<-- "../../vllm/model_executor/layers/fused_moe/fused_moe.py:grouped_topk" + +**9. Norm:** + +--8<-- "../../vllm/model_executor/layers/layernorm.py:rms_norm" +--8<-- "../../vllm/model_executor/layers/layernorm.py:rms_norm_gated" +--8<-- "../../vllm/model_executor/layers/layernorm.py:gemma_rms_norm" + +**10. Quantization:** + +--8<-- "../../vllm/model_executor/layers/quantization/input_quant_fp8.py:quant_fp8" + +**11. Rope:** + +--8<-- "../../vllm/model_executor/layers/rotary_embedding/base.py:rotary_embedding" +--8<-- "../../vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py:dual_chunk_rotary_embedding" +--8<-- "../../vllm/model_executor/layers/rotary_embedding/common.py:apply_rotary_emb" ## Guidelines for Implementing a New CustomOp diff --git a/vllm/attention/layers/mm_encoder_attention.py b/vllm/attention/layers/mm_encoder_attention.py index 25f54cc867b5a..44c3d646f7c46 100644 --- a/vllm/attention/layers/mm_encoder_attention.py +++ b/vllm/attention/layers/mm_encoder_attention.py @@ -18,10 +18,13 @@ from vllm.model_executor.models.vision import get_vit_attn_backend logger = init_logger(__name__) +# --8<-- [start:mm_encoder_attn] @CustomOp.register("mm_encoder_attn") class MMEncoderAttention(CustomOp): """Multi-headed attention without any cache, used for multimodal encoder.""" + # --8<-- [end:mm_encoder_attn] + def __init__( self, num_heads: int, diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 7038d0868c7eb..5e904b9070cc1 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ 
-22,6 +22,7 @@ from vllm.utils.collection_utils import LazyDict logger = init_logger(__name__) +# --8<-- [start:fatrelu_and_mul] @CustomOp.register("fatrelu_and_mul") class FatreluAndMul(CustomOp): """An activation function for FATReLU. @@ -35,6 +36,8 @@ class FatreluAndMul(CustomOp): return: (num_tokens, d) or (batch_size, seq_len, d) """ + # --8<-- [end:fatrelu_and_mul] + def __init__(self, threshold: float = 0.0): super().__init__() self.threshold = threshold @@ -58,6 +61,7 @@ class FatreluAndMul(CustomOp): return out +# --8<-- [start:silu_and_mul] @CustomOp.register("silu_and_mul") class SiluAndMul(CustomOp): """An activation function for SwiGLU. @@ -69,6 +73,8 @@ class SiluAndMul(CustomOp): return: (num_tokens, d) or (batch_size, seq_len, d) """ + # --8<-- [end:silu_and_mul] + def __init__(self): super().__init__() if current_platform.is_cuda_alike(): @@ -101,6 +107,7 @@ class SiluAndMul(CustomOp): return out +# --8<-- [start:mul_and_silu] @CustomOp.register("mul_and_silu") class MulAndSilu(CustomOp): """An activation function for SwiGLU. @@ -112,6 +119,8 @@ class MulAndSilu(CustomOp): return: (num_tokens, d) or (batch_size, seq_len, d) """ + # --8<-- [end:mul_and_silu] + def __init__(self): super().__init__() if current_platform.is_cuda_alike(): @@ -139,6 +148,7 @@ class MulAndSilu(CustomOp): # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: +# --8<-- [start:gelu_and_mul_sparse] @CustomOp.register("gelu_and_mul_sparse") class GeluAndMulSparse(CustomOp): """An activation function for GeluAndMulSparse. @@ -153,6 +163,8 @@ class GeluAndMulSparse(CustomOp): return: (num_tokens, d) or (batch_size, seq_len, d) """ + # --8<-- [end:gelu_and_mul_sparse] + def __init__(self, activation_sparsity: float, approximate: str = "none"): super().__init__() # Gelu. 
@@ -195,6 +207,7 @@ class GeluAndMulSparse(CustomOp): return self.forward_native(x) +# --8<-- [start:gelu_and_mul] @CustomOp.register("gelu_and_mul") class GeluAndMul(CustomOp): """An activation function for GeGLU. @@ -206,6 +219,8 @@ class GeluAndMul(CustomOp): return: (batch_size, seq_len, d) or (num_tokens, d) """ + # --8<-- [end:gelu_and_mul] + def __init__(self, approximate: str = "none"): super().__init__() self.approximate = approximate @@ -257,9 +272,12 @@ class GeluAndMul(CustomOp): return f"approximate={repr(self.approximate)}" +# --8<-- [start:swigluoai_and_mul] @CustomOp.register("swigluoai_and_mul") class SwigluOAIAndMul(CustomOp): # https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/models/gpt_oss/modeling_gpt_oss.py#L106-L110 + # --8<-- [end:swigluoai_and_mul] + def __init__(self, alpha: float = 1.702, limit: float = 7.0): super().__init__() self.alpha = alpha @@ -286,8 +304,11 @@ class SwigluOAIAndMul(CustomOp): return f"alpha={repr(self.alpha)}, limit={repr(self.limit)}" +# --8<-- [start:gelu_new] @CustomOp.register("gelu_new") class NewGELU(CustomOp): + # --8<-- [end:gelu_new] + def __init__(self): super().__init__() if current_platform.is_cuda_alike() or current_platform.is_cpu(): @@ -311,8 +332,11 @@ class NewGELU(CustomOp): return self.op(x) +# --8<-- [start:gelu_fast] @CustomOp.register("gelu_fast") class FastGELU(CustomOp): + # --8<-- [end:gelu_fast] + def __init__(self): super().__init__() if current_platform.is_cuda_alike() or current_platform.is_cpu(): @@ -335,9 +359,12 @@ class FastGELU(CustomOp): return self.op(x) +# --8<-- [start:quick_gelu] @CustomOp.register("quick_gelu") class QuickGELU(CustomOp): # https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py#L90 + # --8<-- [end:quick_gelu] + def __init__(self): super().__init__() if current_platform.is_cuda_alike() or current_platform.is_cpu(): @@ -365,12 +392,15 @@ class QuickGELU(CustomOp): # def forward_xpu(self, x: torch.Tensor) -> 
torch.Tensor: +# --8<-- [start:relu2] @CustomOp.register("relu2") class ReLUSquaredActivation(CustomOp): """ Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2 """ + # --8<-- [end:relu2] + def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" return torch.square(F.relu(x)) @@ -380,6 +410,7 @@ class ReLUSquaredActivation(CustomOp): return self.forward_native(x) +# --8<-- [start:xielu] @CustomOp.register("xielu") class XIELU(CustomOp): """ @@ -388,6 +419,8 @@ class XIELU(CustomOp): Otherwise, we emit a single warning and use xIELU Python """ + # --8<-- [end:xielu] + def __init__( self, alpha_p_init: float = 0.8, diff --git a/vllm/model_executor/layers/conv.py b/vllm/model_executor/layers/conv.py index 1cd02698b3863..f4709f2f4d80f 100644 --- a/vllm/model_executor/layers/conv.py +++ b/vllm/model_executor/layers/conv.py @@ -105,10 +105,13 @@ class ConvLayerBase(CustomOp): return s +# --8<-- [start:conv2d] @CustomOp.register("conv2d") class Conv2dLayer(ConvLayerBase): """Conv layer with Conv2d.""" + # --8<-- [end:conv2d] + num_dim = 2 def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor: @@ -204,10 +207,13 @@ class CausalConv2dLayer(Conv2dLayer): return x +# --8<-- [start:conv3d] @CustomOp.register("conv3d") class Conv3dLayer(ConvLayerBase): """Conv layer with Conv3d.""" + # --8<-- [end:conv3d] + num_dim = 3 def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index bf51554341607..fae093b6c72f3 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1283,10 +1283,13 @@ def grouped_topk( return topk_weights.to(torch.float32), topk_ids.to(torch.int32) +# --8<-- [start:grouped_topk] @CustomOp.register("grouped_topk") class GroupedTopk(CustomOp): """GroupedTopk used by the Deepseek-V2 and 
Deepseek-V3 model.""" + # --8<-- [end:grouped_topk] + def __init__( self, topk: int, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index 30ff1bf2f008a..c930a63a2c1ac 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -20,8 +20,11 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( logger = init_logger(__name__) +# --8<-- [start:modular_fused_moe] @CustomOp.register("modular_fused_moe") class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): + # --8<-- [end:modular_fused_moe] + def __init__( self, old_quant_method: FusedMoEMethodBase, experts: FusedMoEModularKernel ): diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 2e7267d56d838..f54d942683d02 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -297,6 +297,7 @@ def maybe_roundup_hidden_size( return hidden_size +# --8<-- [start:fused_moe] @CustomOp.register("fused_moe") class FusedMoE(CustomOp): """FusedMoE layer for MoE models. @@ -320,6 +321,8 @@ class FusedMoE(CustomOp): enable_eplb: Whether to enable expert parallelism load balancer. 
""" + # --8<-- [end:fused_moe] + def __init__( self, num_experts: int, # Global number of experts diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index 82dbccf3fa9da..85a7782fbf497 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -46,10 +46,13 @@ else: logger = init_logger(__name__) +# --8<-- [start:unquantized_fused_moe] @CustomOp.register("unquantized_fused_moe") class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): """MoE method without quantization.""" + # --8<-- [end:unquantized_fused_moe] + def __init__(self, moe: FusedMoEConfig): super().__init__(moe) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 8cc374ac9155d..d962ab8bbc22d 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -88,6 +88,7 @@ def dispatch_rocm_rmsnorm_func( return rms_norm +# --8<-- [start:rms_norm] @CustomOp.register("rms_norm") class RMSNorm(CustomOp): """Root mean square normalization. @@ -96,6 +97,8 @@ class RMSNorm(CustomOp): Refer to https://arxiv.org/abs/1910.07467 """ + # --8<-- [end:rms_norm] + def __init__( self, hidden_size: int, @@ -253,6 +256,7 @@ class RMSNorm(CustomOp): return s +# --8<-- [start:gemma_rms_norm] @CustomOp.register("gemma_rms_norm") class GemmaRMSNorm(CustomOp): """RMS normalization for Gemma. @@ -262,6 +266,8 @@ class GemmaRMSNorm(CustomOp): 2. (x * w).to(orig_dtype) instead of x.to(orig_dtype) * w. """ + # --8<-- [end:gemma_rms_norm] + def __init__( self, hidden_size: int, @@ -321,6 +327,7 @@ class GemmaRMSNorm(CustomOp): return self.forward_native(x, residual) +# --8<-- [start:rms_norm_gated] @CustomOp.register("rms_norm_gated") class RMSNormGated(CustomOp): """RMS Normalization with optional gating. 
@@ -331,6 +338,8 @@ class RMSNormGated(CustomOp): - Optional gating with SiLU activation """ + # --8<-- [end:rms_norm_gated] + def __init__( self, hidden_size: int, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 402f0bf69ceaa..e4131a5994bb1 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -296,6 +296,7 @@ class LinearBase(CustomOp): param.tp_size = self.tp_size +# --8<-- [start:replicated_linear] @CustomOp.register("replicated_linear") class ReplicatedLinear(LinearBase): """Replicated linear layer. @@ -313,6 +314,8 @@ class ReplicatedLinear(LinearBase): disable_tp: Take no effect for replicated linear layers. """ + # --8<-- [end:replicated_linear] + def __init__( self, input_size: int, @@ -413,6 +416,7 @@ class ReplicatedLinear(LinearBase): return s +# --8<-- [start:column_parallel_linear] @CustomOp.register("column_parallel_linear") class ColumnParallelLinear(LinearBase): """Linear layer with column parallelism. @@ -440,6 +444,8 @@ class ColumnParallelLinear(LinearBase): disable_tp: If true, weights matrix won't be sharded through tp rank. """ + # --8<-- [end:column_parallel_linear] + def __init__( self, input_size: int, @@ -1276,6 +1282,7 @@ class QKVParallelLinear(ColumnParallelLinear): param_data.copy_(loaded_weight) +# --8<-- [start:row_parallel_linear] @CustomOp.register("row_parallel_linear") class RowParallelLinear(LinearBase): """Linear layer with row parallelism. @@ -1310,6 +1317,8 @@ class RowParallelLinear(LinearBase): disable_tp: If true, weights matrix won't be sharded through tp rank. 
""" + # --8<-- [end:row_parallel_linear] + def __init__( self, input_size: int, diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index c8d57f597d1ca..38753b0fcc748 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -13,6 +13,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmb from vllm.platforms import current_platform +# --8<-- [start:logits_processor] @CustomOp.register("logits_processor") class LogitsProcessor(CustomOp): """Process logits and apply logits processors from sampling metadata. @@ -23,6 +24,8 @@ class LogitsProcessor(CustomOp): 3. Apply logits processors (if any). """ + # --8<-- [end:logits_processor] + def __init__( self, vocab_size: int, diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index 0b63acf2dc5a5..0fa11cf02eede 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -39,6 +39,7 @@ from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionMetadata # Adapted from transformers.models.mamba.modeling_mamba.MambaMixer +# --8<-- [start:mamba_mixer] @CustomOp.register("mamba_mixer") class MambaMixer(MambaBase, CustomOp): """ @@ -51,6 +52,8 @@ class MambaMixer(MambaBase, CustomOp): **selective** state spaces) """ + # --8<-- [end:mamba_mixer] + def __init__( self, hidden_size: int, diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 0ea5805305eda..875bc9019fbac 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -49,8 +49,11 @@ from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata # Adapted from transformers.models.mamba2.modeling_mamba2.MambaRMSNormGated +# --8<-- [start:mixer2_gated_rms_norm] 
@CustomOp.register("mixer2_gated_rms_norm") class Mixer2RMSNormGated(CustomOp): + # --8<-- [end:mixer2_gated_rms_norm] + def __init__( self, full_hidden_size: int, @@ -214,6 +217,7 @@ def mamba_v2_sharded_weight_loader( # Adapted from transformers.models.mamba.modeling_mamba.MambaMixer +# --8<-- [start:mamba_mixer2] @CustomOp.register("mamba_mixer2") class MambaMixer2(MambaBase, CustomOp): """ @@ -226,6 +230,8 @@ class MambaMixer2(MambaBase, CustomOp): **selective** state spaces) """ + # --8<-- [end:mamba_mixer2] + def __init__( self, hidden_size: int, diff --git a/vllm/model_executor/layers/mamba/short_conv.py b/vllm/model_executor/layers/mamba/short_conv.py index 0bbad17d7ebc7..af9cd4b6d39b8 100644 --- a/vllm/model_executor/layers/mamba/short_conv.py +++ b/vllm/model_executor/layers/mamba/short_conv.py @@ -27,8 +27,11 @@ from vllm.utils.torch_utils import direct_register_custom_op from vllm.v1.attention.backends.short_conv_attn import ShortConvAttentionMetadata +# --8<-- [start:short_conv] @CustomOp.register("short_conv") class ShortConv(MambaBase, CustomOp): + # --8<-- [end:short_conv] + def __init__( self, config, diff --git a/vllm/model_executor/layers/mla.py b/vllm/model_executor/layers/mla.py index 1656f4deb6717..65541d2a485a8 100644 --- a/vllm/model_executor/layers/mla.py +++ b/vllm/model_executor/layers/mla.py @@ -29,6 +29,7 @@ class MLAModules: indexer_rotary_emb: torch.nn.Module | None = None +# --8<-- [start:multi_head_latent_attention] @CustomOp.register("multi_head_latent_attention") class MultiHeadLatentAttentionWrapper(CustomOp): """MLA layer registered as CustomOp to allow OOT backends to add @@ -47,6 +48,8 @@ class MultiHeadLatentAttentionWrapper(CustomOp): 3. Return the output tensor. 
""" + # --8<-- [end:multi_head_latent_attention] + def __init__( self, hidden_size: int, diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py index 7994c838ad548..6f43eac14e8c7 100644 --- a/vllm/model_executor/layers/quantization/input_quant_fp8.py +++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py @@ -19,6 +19,7 @@ _FP8_MIN = -224.0 if current_platform.is_fp8_fnuz() else _FP8_FINFO.min _FP8_MIN_SCALING_FACTOR = 1.0 / (_FP8_MAX * 512.0) +# --8<-- [start:quant_fp8] @CustomOp.register("quant_fp8") class QuantFP8(CustomOp): """ @@ -26,6 +27,8 @@ class QuantFP8(CustomOp): This CustomOp supports both static and dynamic quantization. """ + # --8<-- [end:quant_fp8] + def __init__( self, static: bool, diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py index 7e83ea9a1355b..155d6c5541b50 100644 --- a/vllm/model_executor/layers/rotary_embedding/base.py +++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -10,10 +10,13 @@ from vllm.model_executor.custom_op import CustomOp from .common import ApplyRotaryEmb +# --8<-- [start:rotary_embedding] @CustomOp.register("rotary_embedding") class RotaryEmbeddingBase(CustomOp): """Original rotary positional embedding.""" + # --8<-- [end:rotary_embedding] + def __init__( self, head_size: int, diff --git a/vllm/model_executor/layers/rotary_embedding/common.py b/vllm/model_executor/layers/rotary_embedding/common.py index b86cd9f001d61..dba19471eb618 100644 --- a/vllm/model_executor/layers/rotary_embedding/common.py +++ b/vllm/model_executor/layers/rotary_embedding/common.py @@ -118,8 +118,11 @@ direct_register_custom_op( ) +# --8<-- [start:apply_rotary_emb] @CustomOp.register("apply_rotary_emb") class ApplyRotaryEmb(CustomOp): + # --8<-- [end:apply_rotary_emb] + def __init__( self, enforce_enable: bool = False, diff --git 
a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py index b5dd94cc7f531..e5dabe035b34e 100644 --- a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py @@ -9,10 +9,13 @@ from vllm.model_executor.custom_op import CustomOp from .common import rotate_gptj, rotate_neox +# --8<-- [start:dual_chunk_rotary_embedding] @CustomOp.register("dual_chunk_rotary_embedding") class DualChunkRotaryEmbedding(CustomOp): """Rotary positional embedding for Dual Chunk Attention.""" + # --8<-- [end:dual_chunk_rotary_embedding] + def __init__( self, head_size: int, diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 1abc3ad884550..daaa86bed4786 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -181,6 +181,7 @@ def get_masked_input_and_mask( return input_, ~vocab_mask +# --8<-- [start:vocab_parallel_embedding] @CustomOp.register("vocab_parallel_embedding") class VocabParallelEmbedding(CustomOp): """Embedding parallelized in the vocabulary dimension. @@ -221,6 +222,8 @@ class VocabParallelEmbedding(CustomOp): prefix: full name of the layer in the state dict """ # noqa: E501 + # --8<-- [end:vocab_parallel_embedding] + def __init__( self, num_embeddings: int, @@ -492,6 +495,7 @@ class VocabParallelEmbedding(CustomOp): return s +# --8<-- [start:parallel_lm_head] @CustomOp.register("parallel_lm_head") class ParallelLMHead(VocabParallelEmbedding): """Parallelized LM head. @@ -509,6 +513,8 @@ class ParallelLMHead(VocabParallelEmbedding): padding_size: padding size for the vocabulary. 
""" + # --8<-- [end:parallel_lm_head] + def __init__( self, num_embeddings: int, diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 6765ee0c5779c..b957abd2e8760 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -97,8 +97,11 @@ def is_mamba(config: Plamo2Config, i: int) -> bool: # Adapted from: # vllm.model_executor.layers.mamba.mamba_mixer2.MambaMixer2 # transformers.models.mamba.modeling_mamba.MambaMixer -@CustomOp.register(name="plamo2_mamba_mixer") +# --8<-- [start:plamo2_mamba_mixer] +@CustomOp.register("plamo2_mamba_mixer") class Plamo2MambaMixer(MambaBase, CustomOp): + # --8<-- [end:plamo2_mamba_mixer] + def __init__(self, vllm_config: VllmConfig, *, prefix: str = "", **kwargs) -> None: super().__init__() self.config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index 31db9d682bd40..46bd475c01323 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -37,10 +37,13 @@ if TYPE_CHECKING: from vllm.config import VllmConfig +# --8<-- [start:transformers_fused_moe] @CustomOp.register("transformers_fused_moe") class TransformersFusedMoE(FusedMoE): """Custom FusedMoE for the Transformers modeling backend.""" + # --8<-- [end:transformers_fused_moe] + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._topk_ids: torch.Tensor = None From 7fdd6a11fae521326bcbf7b836fda3b0f6a45bc9 Mon Sep 17 00:00:00 2001 From: shen-shanshan <467638484@qq.com> Date: Wed, 24 Dec 2025 06:41:36 +0000 Subject: [PATCH 5/6] update Signed-off-by: shen-shanshan <467638484@qq.com> --- docs/design/custom_op.md | 98 ++++++++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 38 deletions(-) diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md index bf386b94e92c6..0345e42d07e79 100644 --- 
a/docs/design/custom_op.md +++ b/docs/design/custom_op.md @@ -59,74 +59,96 @@ For example: **1. Attention:** ---8<-- "../../vllm/attention/layers/mm_encoder_attention.py:mm_encoder_attn" ---8<-- "../../vllm/model_executor/layers/mla.py:multi_head_latent_attention" +```python +--8<-- "vllm/attention/layers/mm_encoder_attention.py:mm_encoder_attn" +--8<-- "vllm/model_executor/layers/mla.py:multi_head_latent_attention" +``` **2. Activation:** ---8<-- "../../vllm/model_executor/layers/activation.py:silu_and_mul" ---8<-- "../../vllm/model_executor/layers/activation.py:mul_and_silu" ---8<-- "../../vllm/model_executor/layers/activation.py:gelu_new" ---8<-- "../../vllm/model_executor/layers/activation.py:gelu_fast" ---8<-- "../../vllm/model_executor/layers/activation.py:quick_gelu" ---8<-- "../../vllm/model_executor/layers/activation.py:gelu_and_mul" ---8<-- "../../vllm/model_executor/layers/activation.py:gelu_and_mul_sparse" ---8<-- "../../vllm/model_executor/layers/activation.py:relu2" ---8<-- "../../vllm/model_executor/layers/activation.py:xielu" ---8<-- "../../vllm/model_executor/layers/activation.py:swigluoai_and_mul" ---8<-- "../../vllm/model_executor/layers/activation.py:fatrelu_and_mul" +```python +--8<-- "vllm/model_executor/layers/activation.py:silu_and_mul" +--8<-- "vllm/model_executor/layers/activation.py:mul_and_silu" +--8<-- "vllm/model_executor/layers/activation.py:gelu_new" +--8<-- "vllm/model_executor/layers/activation.py:gelu_fast" +--8<-- "vllm/model_executor/layers/activation.py:quick_gelu" +--8<-- "vllm/model_executor/layers/activation.py:gelu_and_mul" +--8<-- "vllm/model_executor/layers/activation.py:gelu_and_mul_sparse" +--8<-- "vllm/model_executor/layers/activation.py:relu2" +--8<-- "vllm/model_executor/layers/activation.py:xielu" +--8<-- "vllm/model_executor/layers/activation.py:swigluoai_and_mul" +--8<-- "vllm/model_executor/layers/activation.py:fatrelu_and_mul" +``` **3. 
MM-Conv:** ---8<-- "../../vllm/model_executor/layers/conv.py:conv2d" ---8<-- "../../vllm/model_executor/layers/conv.py:conv3d" +```python +--8<-- "vllm/model_executor/layers/conv.py:conv2d" +--8<-- "vllm/model_executor/layers/conv.py:conv3d" +``` **4. Embedding:** ---8<-- "../../vllm/model_executor/layers/vocab_parallel_embedding.py:vocab_parallel_embedding" ---8<-- "../../vllm/model_executor/layers/vocab_parallel_embedding.py:parallel_lm_head" +```python +--8<-- "vllm/model_executor/layers/vocab_parallel_embedding.py:vocab_parallel_embedding" +--8<-- "vllm/model_executor/layers/vocab_parallel_embedding.py:parallel_lm_head" +``` **5. Linear:** ---8<-- "../../vllm/model_executor/layers/linear.py:row_parallel_linear" ---8<-- "../../vllm/model_executor/layers/linear.py:row_parallel_linear:column_parallel_linear" ---8<-- "../../vllm/model_executor/layers/linear.py:row_parallel_linear:replicated_linear" +```python +--8<-- "vllm/model_executor/layers/linear.py:row_parallel_linear" +--8<-- "vllm/model_executor/layers/linear.py:row_parallel_linear:column_parallel_linear" +--8<-- "vllm/model_executor/layers/linear.py:row_parallel_linear:replicated_linear" +``` **6. Logits Processor:** ---8<-- "../../vllm/model_executor/layers/logits_processor.py:logits_processor" +```python +--8<-- "vllm/model_executor/layers/logits_processor.py:logits_processor" +``` **7. 
Mamba:** ---8<-- "../../vllm/model_executor/layers/mamba/mamba_mixer.py:mamba_mixer" ---8<-- "../../vllm/model_executor/layers/mamba/mamba_mixer2.py:mamba_mixer2" ---8<-- "../../vllm/model_executor/layers/mamba/mamba_mixer2.py:mixer2_gated_rms_norm" ---8<-- "../../vllm/model_executor/models/plamo2.py:plamo2_mamba_mixer" ---8<-- "../../vllm/model_executor/layers/mamba/short_conv.py:short_conv" +```python +--8<-- "vllm/model_executor/layers/mamba/mamba_mixer.py:mamba_mixer" +--8<-- "vllm/model_executor/layers/mamba/mamba_mixer2.py:mamba_mixer2" +--8<-- "vllm/model_executor/layers/mamba/mamba_mixer2.py:mixer2_gated_rms_norm" +--8<-- "vllm/model_executor/models/plamo2.py:plamo2_mamba_mixer" +--8<-- "vllm/model_executor/layers/mamba/short_conv.py:short_conv" +``` **8. MoE:** ---8<-- "../../vllm/model_executor/layers/fused_moe/layer.py:fused_moe" ---8<-- "../../vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py:modular_fused_moe" ---8<-- "../../vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py:unquantized_fused_moe" ---8<-- "../../vllm/model_executor/models/transformers/moe.py:transformers_fused_moe" ---8<-- "../../vllm/model_executor/layers/fused_moe/fused_moe.py:grouped_topk" +```python +--8<-- "vllm/model_executor/layers/fused_moe/layer.py:fused_moe" +--8<-- "vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py:modular_fused_moe" +--8<-- "vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py:unquantized_fused_moe" +--8<-- "vllm/model_executor/models/transformers/moe.py:transformers_fused_moe" +--8<-- "vllm/model_executor/layers/fused_moe/fused_moe.py:grouped_topk" +``` **9. 
Norm:** ---8<-- "../../vllm/model_executor/layers/layernorm.py:rms_norm" ---8<-- "../../vllm/model_executor/layers/layernorm.py:rms_norm_gated" ---8<-- "../../vllm/model_executor/layers/layernorm.py:gemma_rms_norm" +```python +--8<-- "vllm/model_executor/layers/layernorm.py:rms_norm" +--8<-- "vllm/model_executor/layers/layernorm.py:rms_norm_gated" +--8<-- "vllm/model_executor/layers/layernorm.py:gemma_rms_norm" +``` **10. Quantization:** ---8<-- "../../vllm/model_executor/layers/quantization/input_quant_fp8.py:quant_fp8" +```python +--8<-- "vllm/model_executor/layers/quantization/input_quant_fp8.py:quant_fp8" +``` **11. Rope:** ---8<-- "../../vllm/model_executor/layers/rotary_embedding/base.py:rotary_embedding" ---8<-- "../../vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py:dual_chunk_rotary_embedding" ---8<-- "../../vllm/model_executor/layers/rotary_embedding/common.py:apply_rotary_emb" +```python +--8<-- "vllm/model_executor/layers/rotary_embedding/base.py:rotary_embedding" +--8<-- "vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py:dual_chunk_rotary_embedding" +--8<-- "vllm/model_executor/layers/rotary_embedding/common.py:apply_rotary_emb" +``` ## Guidelines for Implementing a New CustomOp From b0846c1c2375a4e40ac171700f89fffd88fbec75 Mon Sep 17 00:00:00 2001 From: shen-shanshan <467638484@qq.com> Date: Wed, 24 Dec 2025 07:29:48 +0000 Subject: [PATCH 6/6] update format Signed-off-by: shen-shanshan <467638484@qq.com> --- docs/design/custom_op.md | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/docs/design/custom_op.md b/docs/design/custom_op.md index 0345e42d07e79..41df8fec084cd 100644 --- a/docs/design/custom_op.md +++ b/docs/design/custom_op.md @@ -61,6 +61,7 @@ For example: ```python --8<-- "vllm/attention/layers/mm_encoder_attention.py:mm_encoder_attn" + --8<-- "vllm/model_executor/layers/mla.py:multi_head_latent_attention" ``` @@ -68,15 +69,25 @@ For example: ```python --8<-- 
"vllm/model_executor/layers/activation.py:silu_and_mul" + --8<-- "vllm/model_executor/layers/activation.py:mul_and_silu" + --8<-- "vllm/model_executor/layers/activation.py:gelu_new" + --8<-- "vllm/model_executor/layers/activation.py:gelu_fast" + --8<-- "vllm/model_executor/layers/activation.py:quick_gelu" + --8<-- "vllm/model_executor/layers/activation.py:gelu_and_mul" + --8<-- "vllm/model_executor/layers/activation.py:gelu_and_mul_sparse" + --8<-- "vllm/model_executor/layers/activation.py:relu2" + --8<-- "vllm/model_executor/layers/activation.py:xielu" + --8<-- "vllm/model_executor/layers/activation.py:swigluoai_and_mul" + --8<-- "vllm/model_executor/layers/activation.py:fatrelu_and_mul" ``` @@ -84,6 +95,7 @@ For example: ```python --8<-- "vllm/model_executor/layers/conv.py:conv2d" + --8<-- "vllm/model_executor/layers/conv.py:conv3d" ``` @@ -91,6 +103,7 @@ For example: ```python --8<-- "vllm/model_executor/layers/vocab_parallel_embedding.py:vocab_parallel_embedding" + --8<-- "vllm/model_executor/layers/vocab_parallel_embedding.py:parallel_lm_head" ``` @@ -98,8 +111,10 @@ For example: ```python --8<-- "vllm/model_executor/layers/linear.py:row_parallel_linear" ---8<-- "vllm/model_executor/layers/linear.py:row_parallel_linear:column_parallel_linear" ---8<-- "vllm/model_executor/layers/linear.py:row_parallel_linear:replicated_linear" + +--8<-- "vllm/model_executor/layers/linear.py:column_parallel_linear" + +--8<-- "vllm/model_executor/layers/linear.py:replicated_linear" ``` **6. 
Logits Processor:** @@ -112,9 +127,13 @@ For example: ```python --8<-- "vllm/model_executor/layers/mamba/mamba_mixer.py:mamba_mixer" + --8<-- "vllm/model_executor/layers/mamba/mamba_mixer2.py:mamba_mixer2" + --8<-- "vllm/model_executor/layers/mamba/mamba_mixer2.py:mixer2_gated_rms_norm" + --8<-- "vllm/model_executor/models/plamo2.py:plamo2_mamba_mixer" + --8<-- "vllm/model_executor/layers/mamba/short_conv.py:short_conv" ``` @@ -122,9 +141,13 @@ For example: ```python --8<-- "vllm/model_executor/layers/fused_moe/layer.py:fused_moe" + --8<-- "vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py:modular_fused_moe" + --8<-- "vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py:unquantized_fused_moe" + --8<-- "vllm/model_executor/models/transformers/moe.py:transformers_fused_moe" + --8<-- "vllm/model_executor/layers/fused_moe/fused_moe.py:grouped_topk" ``` @@ -132,7 +155,9 @@ For example: ```python --8<-- "vllm/model_executor/layers/layernorm.py:rms_norm" + --8<-- "vllm/model_executor/layers/layernorm.py:rms_norm_gated" + --8<-- "vllm/model_executor/layers/layernorm.py:gemma_rms_norm" ``` @@ -146,7 +171,9 @@ For example: ```python --8<-- "vllm/model_executor/layers/rotary_embedding/base.py:rotary_embedding" + --8<-- "vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py:dual_chunk_rotary_embedding" + --8<-- "vllm/model_executor/layers/rotary_embedding/common.py:apply_rotary_emb" ```