Merge b0846c1c2375a4e40ac171700f89fffd88fbec75 into 254f6b986720c92ddf97fbb1a6a6465da8e87e29

This commit is contained in:
Shanshan Shen 2025-12-25 00:06:53 +00:00 committed by GitHub
commit a5f89df8c1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
24 changed files with 441 additions and 5 deletions

318
docs/design/custom_op.md Normal file
View File

@ -0,0 +1,318 @@
# CustomOp
`CustomOp` is an abstract class used for dispatching the forward method of various operations to the appropriate backend. It also offers a mechanism for both vLLM and OOT (Out-Of-Tree) plugins to register their custom operations.
This document will introduce how CustomOp works in vLLM and how to implement a new `CustomOp`.
## How CustomOp Works in vLLM
`CustomOp` manages two dictionaries of all custom ops (i.e., op classes, indexed by registered name) in its class, for vLLM and OOT plugins respectively.
??? code
```python
class CustomOp(nn.Module):
op_registry: dict[str, type["CustomOp"]] = {}
op_registry_oot: dict[str, type["CustomOp"]] = {}
```
We can use `@CustomOp.register("op_name")` to register an op class to the `CustomOp` system. After this, the `op_name` and its class will be added into the `op_registry` dictionary. In addition, we can also register an OOT op with `@CustomOp.register_oot("op_name")`. We will introduce this mechanism in detail later.
When a `CustomOp` is called (i.e., its `forward()` method is called), if it is enabled, it will automatically dispatch the forward method to the appropriate backend according to `current_platform`. Otherwise (i.e., it is disabled), it will only call the `forward_native()` method to use the PyTorch-native implementation of this forward method.
- **CPU platform:** dispatch to `forward_cpu()`.
- **CUDA platform:** dispatch to `forward_cuda()`.
- **ROCm platform:** dispatch to `forward_hip()`. If `forward_hip()` is not implemented, it will use `forward_cuda()` as a fallback.
- **XPU platform:** dispatch to `forward_xpu()`.
- **TPU platform:** dispatch to `forward_tpu()`.
- **OOT platform:** dispatch to `forward_oot()`. This will only be called on OOT platforms.
- **Default:** dispatch to `forward_native()` as a final fallback for all platforms.
!!! note
Note that the dispatching logic might not be absolute because of class inheritance. A derived class might override the behavior.
Furthermore, vLLM decides whether to enable or disable a `CustomOp` based on `compilation_config.custom_ops`. To be specific, if a `CustomOp` is not registered (i.e., it uses the default config), it will be enabled if there is an `all` in `compilation_config.custom_ops` or disabled if there is a `none`.
!!! note
Note that `all` and `none` cannot coexist in `compilation_config.custom_ops`.
By default, if `compilation_config.backend == "inductor"` and `compilation_config.mode != CompilationMode.NONE`, a `none` will be appended to `compilation_config.custom_ops`; otherwise an `all` will be appended. In other words, `CustomOp` will be disabled on some platforms (i.e., those that use `inductor` as the default backend for `torch.compile`) when running in torch compile mode. In this case, Inductor generates (fused) Triton kernels for those disabled custom ops.
!!! note
For multi-modal models, vLLM force-enables some custom ops so that the ViT part uses device-specific, deeply optimized kernels for better performance, such as `MMEncoderAttention` and `ApplyRotaryEmb`. We can also pass `enforce_enable=True` to the `__init__()` method of a `CustomOp` to force-enable it at the object level.
Note that this `enforce_enable` mechanism will be removed after we add a separate `compilation_config` for the multi-modal part.
## How to Customise Your Configuration for CustomOp
vLLM also offers fine-grained control over which custom ops to enable or disable for users, by manually passing a `--compilation_config.custom_ops '["..."]'` when launching a server.
For example:
- Use `--compilation_config.custom_ops '["all"]'` to enable all custom ops.
- Use `--compilation_config.custom_ops '["none"]'` to disable all custom ops.
- Use `--compilation_config.custom_ops '["all,-op1"]'` to enable all custom ops except op1 (i.e., prefixed with a `-` means "disable").
- Use `--compilation_config.custom_ops '["none,+op1,+op2"]'` to only enable op1 and op2 (i.e., prefixed with a `+` means "enable").
## Types of Supported CustomOp in vLLM
**1. Attention:**
```python
--8<-- "vllm/attention/layers/mm_encoder_attention.py:mm_encoder_attn"
--8<-- "vllm/model_executor/layers/mla.py:multi_head_latent_attention"
```
**2. Activation:**
```python
--8<-- "vllm/model_executor/layers/activation.py:silu_and_mul"
--8<-- "vllm/model_executor/layers/activation.py:mul_and_silu"
--8<-- "vllm/model_executor/layers/activation.py:gelu_new"
--8<-- "vllm/model_executor/layers/activation.py:gelu_fast"
--8<-- "vllm/model_executor/layers/activation.py:quick_gelu"
--8<-- "vllm/model_executor/layers/activation.py:gelu_and_mul"
--8<-- "vllm/model_executor/layers/activation.py:gelu_and_mul_sparse"
--8<-- "vllm/model_executor/layers/activation.py:relu2"
--8<-- "vllm/model_executor/layers/activation.py:xielu"
--8<-- "vllm/model_executor/layers/activation.py:swigluoai_and_mul"
--8<-- "vllm/model_executor/layers/activation.py:fatrelu_and_mul"
```
**3. MM-Conv:**
```python
--8<-- "vllm/model_executor/layers/conv.py:conv2d"
--8<-- "vllm/model_executor/layers/conv.py:conv3d"
```
**4. Embedding:**
```python
--8<-- "vllm/model_executor/layers/vocab_parallel_embedding.py:vocab_parallel_embedding"
--8<-- "vllm/model_executor/layers/vocab_parallel_embedding.py:parallel_lm_head"
```
**5. Linear:**
```python
--8<-- "vllm/model_executor/layers/linear.py:row_parallel_linear"
--8<-- "vllm/model_executor/layers/linear.py:column_parallel_linear"
--8<-- "vllm/model_executor/layers/linear.py:replicated_linear"
```
**6. Logits Processor:**
```python
--8<-- "vllm/model_executor/layers/logits_processor.py:logits_processor"
```
**7. Mamba:**
```python
--8<-- "vllm/model_executor/layers/mamba/mamba_mixer.py:mamba_mixer"
--8<-- "vllm/model_executor/layers/mamba/mamba_mixer2.py:mamba_mixer2"
--8<-- "vllm/model_executor/layers/mamba/mamba_mixer2.py:mixer2_gated_rms_norm"
--8<-- "vllm/model_executor/models/plamo2.py:plamo2_mamba_mixer"
--8<-- "vllm/model_executor/layers/mamba/short_conv.py:short_conv"
```
**8. MoE:**
```python
--8<-- "vllm/model_executor/layers/fused_moe/layer.py:fused_moe"
--8<-- "vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py:modular_fused_moe"
--8<-- "vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py:unquantized_fused_moe"
--8<-- "vllm/model_executor/models/transformers/moe.py:transformers_fused_moe"
--8<-- "vllm/model_executor/layers/fused_moe/fused_moe.py:grouped_topk"
```
**9. Norm:**
```python
--8<-- "vllm/model_executor/layers/layernorm.py:rms_norm"
--8<-- "vllm/model_executor/layers/layernorm.py:rms_norm_gated"
--8<-- "vllm/model_executor/layers/layernorm.py:gemma_rms_norm"
```
**10. Quantization:**
```python
--8<-- "vllm/model_executor/layers/quantization/input_quant_fp8.py:quant_fp8"
```
**11. Rope:**
```python
--8<-- "vllm/model_executor/layers/rotary_embedding/base.py:rotary_embedding"
--8<-- "vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py:dual_chunk_rotary_embedding"
--8<-- "vllm/model_executor/layers/rotary_embedding/common.py:apply_rotary_emb"
```
## Guidelines for Implementing a New CustomOp
### Implement a New CustomOp in vLLM
This part is a tutorial on how to implement a new `CustomOp` in vLLM.
Steps:
1. Implement a new op class, which extends from `CustomOp` base class.
2. Add the `@CustomOp.register("op_name")` decorator on this op class to register it into `CustomOp` system.
3. Implement different `forward_xxx()` methods according to your needs.
Taking `MMEncoderAttention` as an example:
??? code
```python
@CustomOp.register("mm_encoder_attn")
class MMEncoderAttention(CustomOp):
def __init__(
self,
num_heads: int,
head_size: int,
scale: float | None = None,
num_kv_heads: int | None = None,
prefix: str = "",
multimodal_config: MultiModalConfig | None = None,
) -> None:
super().__init__()
# Init...
def forward_native(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
cu_seqlens: torch.Tensor | None = None,
max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention
) -> torch.Tensor:
# Call TORCH_SDPA implementation...
def forward_cuda(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
cu_seqlens: torch.Tensor | None = None,
max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention
) -> torch.Tensor:
# Call FA or TORCH_SDPA implementation...
def forward_cpu(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
cu_seqlens: torch.Tensor | None = None,
max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention
) -> torch.Tensor:
# Call TORCH_SDPA implementation...
def forward_xpu(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
cu_seqlens: torch.Tensor | None = None,
max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention
) -> torch.Tensor:
# Call FA implementation...
def forward_tpu(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
cu_seqlens: torch.Tensor | None = None,
max_seqlen: torch.Tensor | None = None, # Only used for Flash Attention
) -> torch.Tensor:
# Call PALLAS implementation...
```
### Register a New CustomOp in OOT Device Plugins
Currently, thanks to [vLLM's hardware-plugin mechanism](./plugin_system.md), various OOT device plugins are emerging to enable vLLM to run seamlessly on different hardware. You can also find more details about this mechanism at [Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html).
- **Official device plugins:** [vllm-ascend](https://github.com/vllm-project/vllm-ascend) (for Huawei Ascend NPU), [vllm-spyre](https://github.com/vllm-project/vllm-spyre)
(for Spyre), [vllm-gaudi](https://github.com/vllm-project/vllm-gaudi) (for Intel Gaudi), [vllm-neuron](https://github.com/vllm-project/vllm-neuron) (for AWS Neuron), [vllm-metal](https://github.com/vllm-project/vllm-metal) (for Apple Silicon), etc.
- **Non-official device plugins:** [vllm-metax](https://github.com/MetaX-MACA/vLLM-metax) (for MetaX GPU), [vllm-kunlun](https://github.com/baidu/vLLM-Kunlun) (for Baidu Kunlun XPU), etc.
In this case, `CustomOp` can enable these hardware manufacturers to seamlessly replace vLLM's operations with their deep-optimized kernels for specific devices at runtime, by just registering an OOT `CustomOp` and implementing the `forward_oot()` method.
Now, this part will show you how to register an OOT `CustomOp` for a device plugin.
Taking `MMEncoderAttention` as an example:
1. Implement a `CustomMMEncoderAttention` class which extends from `MMEncoderAttention` and implement its `forward_oot()` method.
2. Register your `CustomMMEncoderAttention` into vLLM to replace `MMEncoderAttention`.
??? code
```python
from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
from vllm.model_executor.custom_op import CustomOp
@CustomOp.register_oot("MMEncoderAttention")
class CustomMMEncoderAttention(MMEncoderAttention):
def __init__(...):
super().__init__(...)
def forward_oot(...):
# Call optimized device-specific kernels.
...
```
In this case, a new item `{"MMEncoderAttention": CustomMMEncoderAttention}` will be added into `op_registry_oot`. When initializing a `MMEncoderAttention` op object, if the class name (i.e., `MMEncoderAttention`) is contained in the keys of `op_registry_oot`, vLLM will replace it with our registered class (i.e., `CustomMMEncoderAttention`) and instantiate it.
After that, when this `MMEncoderAttention` op is called, your `forward_oot()` will be called if it is enabled. Thus, you will get the expected performance on your hardware without directly modifying vLLM.
In addition, you can also register all your `CustomOp` classes in one place for better management.
??? code
```python
from vllm.model_executor.custom_op import CustomOp
REGISTERED_CUSTOM_OPS = {
"CustomOP1": YourCustomOp1,
"CustomOP2": YourCustomOp2,
"CustomOP3": YourCustomOp3,
}
for op_name, op_cls in REGISTERED_CUSTOM_OPS.items():
CustomOp.register_oot(_decorated_op_cls=op_cls, name=op_name)
```

View File

@ -18,10 +18,13 @@ from vllm.model_executor.models.vision import get_vit_attn_backend
logger = init_logger(__name__) logger = init_logger(__name__)
# --8<-- [start:mm_encoder_attn]
@CustomOp.register("mm_encoder_attn") @CustomOp.register("mm_encoder_attn")
class MMEncoderAttention(CustomOp): class MMEncoderAttention(CustomOp):
"""Multi-headed attention without any cache, used for multimodal encoder.""" """Multi-headed attention without any cache, used for multimodal encoder."""
# --8<-- [end:mm_encoder_attn]
def __init__( def __init__(
self, self,
num_heads: int, num_heads: int,

View File

@ -404,7 +404,8 @@ class CompilationConfig:
- 'none,+op1,+op2' to enable only op1 and op2 - 'none,+op1,+op2' to enable only op1 and op2
By default, all custom ops are enabled when running without Inductor and By default, all custom ops are enabled when running without Inductor and
disabled when running with Inductor: mode>=VLLM_COMPILE and backend="inductor". disabled when running with Inductor: mode>CompilationMode.NONE and
backend="inductor".
Inductor generates (fused) Triton kernels for disabled custom ops.""" Inductor generates (fused) Triton kernels for disabled custom ops."""
splitting_ops: list[str] | None = None splitting_ops: list[str] | None = None
"""A list of ops to exclude from cudagraphs, used in piecewise compilation. """A list of ops to exclude from cudagraphs, used in piecewise compilation.

View File

@ -86,9 +86,12 @@ class CustomOp(nn.Module):
# specific backend. Currently, we do not support dynamic dispatching. # specific backend. Currently, we do not support dynamic dispatching.
compilation_config = get_cached_compilation_config() compilation_config = get_cached_compilation_config()
# CustomOp object can be enforce enabled, e.g., enable device-specific # NOTE(shen-shanshan): CustomOp object can be enforce enabled, e.g.,
# kernels in ViT models when enabling graph mode. By default, it will # enable device-specific kernels in ViT models when enabling graph
# follow the compilation_config to determine whether enable itself. # mode. By default, it will follow the compilation_config to determine
# whether enable itself.
# This enforce_enable mechanism will be removed after we adding a
# separate compilation_config for multi-modal part.
enabled = self._enforce_enable or self.enabled() enabled = self._enforce_enable or self.enabled()
if enabled: if enabled:
compilation_config.enabled_custom_ops.update([self.__class__.name]) compilation_config.enabled_custom_ops.update([self.__class__.name])

View File

@ -22,6 +22,7 @@ from vllm.utils.collection_utils import LazyDict
logger = init_logger(__name__) logger = init_logger(__name__)
# --8<-- [start:fatrelu_and_mul]
@CustomOp.register("fatrelu_and_mul") @CustomOp.register("fatrelu_and_mul")
class FatreluAndMul(CustomOp): class FatreluAndMul(CustomOp):
"""An activation function for FATReLU. """An activation function for FATReLU.
@ -35,6 +36,8 @@ class FatreluAndMul(CustomOp):
return: (num_tokens, d) or (batch_size, seq_len, d) return: (num_tokens, d) or (batch_size, seq_len, d)
""" """
# --8<-- [end:fatrelu_and_mul]
def __init__(self, threshold: float = 0.0): def __init__(self, threshold: float = 0.0):
super().__init__() super().__init__()
self.threshold = threshold self.threshold = threshold
@ -58,6 +61,7 @@ class FatreluAndMul(CustomOp):
return out return out
# --8<-- [start:silu_and_mul]
@CustomOp.register("silu_and_mul") @CustomOp.register("silu_and_mul")
class SiluAndMul(CustomOp): class SiluAndMul(CustomOp):
"""An activation function for SwiGLU. """An activation function for SwiGLU.
@ -69,6 +73,8 @@ class SiluAndMul(CustomOp):
return: (num_tokens, d) or (batch_size, seq_len, d) return: (num_tokens, d) or (batch_size, seq_len, d)
""" """
# --8<-- [end:silu_and_mul]
def __init__(self): def __init__(self):
super().__init__() super().__init__()
if current_platform.is_cuda_alike(): if current_platform.is_cuda_alike():
@ -101,6 +107,7 @@ class SiluAndMul(CustomOp):
return out return out
# --8<-- [start:mul_and_silu]
@CustomOp.register("mul_and_silu") @CustomOp.register("mul_and_silu")
class MulAndSilu(CustomOp): class MulAndSilu(CustomOp):
"""An activation function for SwiGLU. """An activation function for SwiGLU.
@ -112,6 +119,8 @@ class MulAndSilu(CustomOp):
return: (num_tokens, d) or (batch_size, seq_len, d) return: (num_tokens, d) or (batch_size, seq_len, d)
""" """
# --8<-- [end:mul_and_silu]
def __init__(self): def __init__(self):
super().__init__() super().__init__()
if current_platform.is_cuda_alike(): if current_platform.is_cuda_alike():
@ -139,6 +148,7 @@ class MulAndSilu(CustomOp):
# def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
# --8<-- [start:gelu_and_mul_sparse]
@CustomOp.register("gelu_and_mul_sparse") @CustomOp.register("gelu_and_mul_sparse")
class GeluAndMulSparse(CustomOp): class GeluAndMulSparse(CustomOp):
"""An activation function for GeluAndMulSparse. """An activation function for GeluAndMulSparse.
@ -153,6 +163,8 @@ class GeluAndMulSparse(CustomOp):
return: (num_tokens, d) or (batch_size, seq_len, d) return: (num_tokens, d) or (batch_size, seq_len, d)
""" """
# --8<-- [end:gelu_and_mul_sparse]
def __init__(self, activation_sparsity: float, approximate: str = "none"): def __init__(self, activation_sparsity: float, approximate: str = "none"):
super().__init__() super().__init__()
# Gelu. # Gelu.
@ -195,6 +207,7 @@ class GeluAndMulSparse(CustomOp):
return self.forward_native(x) return self.forward_native(x)
# --8<-- [start:gelu_and_mul]
@CustomOp.register("gelu_and_mul") @CustomOp.register("gelu_and_mul")
class GeluAndMul(CustomOp): class GeluAndMul(CustomOp):
"""An activation function for GeGLU. """An activation function for GeGLU.
@ -206,6 +219,8 @@ class GeluAndMul(CustomOp):
return: (batch_size, seq_len, d) or (num_tokens, d) return: (batch_size, seq_len, d) or (num_tokens, d)
""" """
# --8<-- [end:gelu_and_mul]
def __init__(self, approximate: str = "none"): def __init__(self, approximate: str = "none"):
super().__init__() super().__init__()
self.approximate = approximate self.approximate = approximate
@ -257,9 +272,12 @@ class GeluAndMul(CustomOp):
return f"approximate={repr(self.approximate)}" return f"approximate={repr(self.approximate)}"
# --8<-- [start:swigluoai_and_mul]
@CustomOp.register("swigluoai_and_mul") @CustomOp.register("swigluoai_and_mul")
class SwigluOAIAndMul(CustomOp): class SwigluOAIAndMul(CustomOp):
# https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/models/gpt_oss/modeling_gpt_oss.py#L106-L110 # https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/models/gpt_oss/modeling_gpt_oss.py#L106-L110
# --8<-- [end:swigluoai_and_mul]
def __init__(self, alpha: float = 1.702, limit: float = 7.0): def __init__(self, alpha: float = 1.702, limit: float = 7.0):
super().__init__() super().__init__()
self.alpha = alpha self.alpha = alpha
@ -286,8 +304,11 @@ class SwigluOAIAndMul(CustomOp):
return f"alpha={repr(self.alpha)}, limit={repr(self.limit)}" return f"alpha={repr(self.alpha)}, limit={repr(self.limit)}"
# --8<-- [start:gelu_new]
@CustomOp.register("gelu_new") @CustomOp.register("gelu_new")
class NewGELU(CustomOp): class NewGELU(CustomOp):
# --8<-- [end:gelu_new]
def __init__(self): def __init__(self):
super().__init__() super().__init__()
if current_platform.is_cuda_alike() or current_platform.is_cpu(): if current_platform.is_cuda_alike() or current_platform.is_cpu():
@ -311,8 +332,11 @@ class NewGELU(CustomOp):
return self.op(x) return self.op(x)
# --8<-- [start:gelu_fast]
@CustomOp.register("gelu_fast") @CustomOp.register("gelu_fast")
class FastGELU(CustomOp): class FastGELU(CustomOp):
# --8<-- [end:gelu_fast]
def __init__(self): def __init__(self):
super().__init__() super().__init__()
if current_platform.is_cuda_alike() or current_platform.is_cpu(): if current_platform.is_cuda_alike() or current_platform.is_cpu():
@ -335,9 +359,12 @@ class FastGELU(CustomOp):
return self.op(x) return self.op(x)
# --8<-- [start:quick_gelu]
@CustomOp.register("quick_gelu") @CustomOp.register("quick_gelu")
class QuickGELU(CustomOp): class QuickGELU(CustomOp):
# https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py#L90 # https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py#L90
# --8<-- [end:quick_gelu]
def __init__(self): def __init__(self):
super().__init__() super().__init__()
if current_platform.is_cuda_alike() or current_platform.is_cpu(): if current_platform.is_cuda_alike() or current_platform.is_cpu():
@ -365,12 +392,15 @@ class QuickGELU(CustomOp):
# def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
# --8<-- [start:relu2]
@CustomOp.register("relu2") @CustomOp.register("relu2")
class ReLUSquaredActivation(CustomOp): class ReLUSquaredActivation(CustomOp):
""" """
Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2 Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2
""" """
# --8<-- [end:relu2]
def forward_native(self, x: torch.Tensor) -> torch.Tensor: def forward_native(self, x: torch.Tensor) -> torch.Tensor:
"""PyTorch-native implementation equivalent to forward().""" """PyTorch-native implementation equivalent to forward()."""
return torch.square(F.relu(x)) return torch.square(F.relu(x))
@ -380,6 +410,7 @@ class ReLUSquaredActivation(CustomOp):
return self.forward_native(x) return self.forward_native(x)
# --8<-- [start:xielu]
@CustomOp.register("xielu") @CustomOp.register("xielu")
class XIELU(CustomOp): class XIELU(CustomOp):
""" """
@ -388,6 +419,8 @@ class XIELU(CustomOp):
Otherwise, we emit a single warning and use xIELU Python Otherwise, we emit a single warning and use xIELU Python
""" """
# --8<-- [end:xielu]
def __init__( def __init__(
self, self,
alpha_p_init: float = 0.8, alpha_p_init: float = 0.8,

View File

@ -105,10 +105,13 @@ class ConvLayerBase(CustomOp):
return s return s
# --8<-- [start:conv2d]
@CustomOp.register("conv2d") @CustomOp.register("conv2d")
class Conv2dLayer(ConvLayerBase): class Conv2dLayer(ConvLayerBase):
"""Conv layer with Conv2d.""" """Conv layer with Conv2d."""
# --8<-- [end:conv2d]
num_dim = 2 num_dim = 2
def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor: def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor:
@ -204,10 +207,13 @@ class CausalConv2dLayer(Conv2dLayer):
return x return x
# --8<-- [start:conv3d]
@CustomOp.register("conv3d") @CustomOp.register("conv3d")
class Conv3dLayer(ConvLayerBase): class Conv3dLayer(ConvLayerBase):
"""Conv layer with Conv3d.""" """Conv layer with Conv3d."""
# --8<-- [end:conv3d]
num_dim = 3 num_dim = 3
def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor: def _forward_mulmat(self, x: torch.Tensor) -> torch.Tensor:

View File

@ -1283,10 +1283,13 @@ def grouped_topk(
return topk_weights.to(torch.float32), topk_ids.to(torch.int32) return topk_weights.to(torch.float32), topk_ids.to(torch.int32)
# --8<-- [start:grouped_topk]
@CustomOp.register("grouped_topk") @CustomOp.register("grouped_topk")
class GroupedTopk(CustomOp): class GroupedTopk(CustomOp):
"""GroupedTopk used by the Deepseek-V2 and Deepseek-V3 model.""" """GroupedTopk used by the Deepseek-V2 and Deepseek-V3 model."""
# --8<-- [end:grouped_topk]
def __init__( def __init__(
self, self,
topk: int, topk: int,

View File

@ -20,8 +20,11 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
logger = init_logger(__name__) logger = init_logger(__name__)
# --8<-- [start:modular_fused_moe]
@CustomOp.register("modular_fused_moe") @CustomOp.register("modular_fused_moe")
class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
# --8<-- [end:modular_fused_moe]
def __init__( def __init__(
self, old_quant_method: FusedMoEMethodBase, experts: FusedMoEModularKernel self, old_quant_method: FusedMoEMethodBase, experts: FusedMoEModularKernel
): ):

View File

@ -297,6 +297,7 @@ def maybe_roundup_hidden_size(
return hidden_size return hidden_size
# --8<-- [start:fused_moe]
@CustomOp.register("fused_moe") @CustomOp.register("fused_moe")
class FusedMoE(CustomOp): class FusedMoE(CustomOp):
"""FusedMoE layer for MoE models. """FusedMoE layer for MoE models.
@ -320,6 +321,8 @@ class FusedMoE(CustomOp):
enable_eplb: Whether to enable expert parallelism load balancer. enable_eplb: Whether to enable expert parallelism load balancer.
""" """
# --8<-- [end:fused_moe]
def __init__( def __init__(
self, self,
num_experts: int, # Global number of experts num_experts: int, # Global number of experts

View File

@ -46,10 +46,13 @@ else:
logger = init_logger(__name__) logger = init_logger(__name__)
# --8<-- [start:unquantized_fused_moe]
@CustomOp.register("unquantized_fused_moe") @CustomOp.register("unquantized_fused_moe")
class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
"""MoE method without quantization.""" """MoE method without quantization."""
# --8<-- [end:unquantized_fused_moe]
def __init__(self, moe: FusedMoEConfig): def __init__(self, moe: FusedMoEConfig):
super().__init__(moe) super().__init__(moe)

View File

@ -88,6 +88,7 @@ def dispatch_rocm_rmsnorm_func(
return rms_norm return rms_norm
# --8<-- [start:rms_norm]
@CustomOp.register("rms_norm") @CustomOp.register("rms_norm")
class RMSNorm(CustomOp): class RMSNorm(CustomOp):
"""Root mean square normalization. """Root mean square normalization.
@ -96,6 +97,8 @@ class RMSNorm(CustomOp):
Refer to https://arxiv.org/abs/1910.07467 Refer to https://arxiv.org/abs/1910.07467
""" """
# --8<-- [end:rms_norm]
def __init__( def __init__(
self, self,
hidden_size: int, hidden_size: int,
@ -253,6 +256,7 @@ class RMSNorm(CustomOp):
return s return s
# --8<-- [start:gemma_rms_norm]
@CustomOp.register("gemma_rms_norm") @CustomOp.register("gemma_rms_norm")
class GemmaRMSNorm(CustomOp): class GemmaRMSNorm(CustomOp):
"""RMS normalization for Gemma. """RMS normalization for Gemma.
@ -262,6 +266,8 @@ class GemmaRMSNorm(CustomOp):
2. (x * w).to(orig_dtype) instead of x.to(orig_dtype) * w. 2. (x * w).to(orig_dtype) instead of x.to(orig_dtype) * w.
""" """
# --8<-- [end:gemma_rms_norm]
def __init__( def __init__(
self, self,
hidden_size: int, hidden_size: int,
@ -321,6 +327,7 @@ class GemmaRMSNorm(CustomOp):
return self.forward_native(x, residual) return self.forward_native(x, residual)
# --8<-- [start:rms_norm_gated]
@CustomOp.register("rms_norm_gated") @CustomOp.register("rms_norm_gated")
class RMSNormGated(CustomOp): class RMSNormGated(CustomOp):
"""RMS Normalization with optional gating. """RMS Normalization with optional gating.
@ -331,6 +338,8 @@ class RMSNormGated(CustomOp):
- Optional gating with SiLU activation - Optional gating with SiLU activation
""" """
# --8<-- [end:rms_norm_gated]
def __init__( def __init__(
self, self,
hidden_size: int, hidden_size: int,

View File

@ -296,6 +296,7 @@ class LinearBase(CustomOp):
param.tp_size = self.tp_size param.tp_size = self.tp_size
# --8<-- [start:replicated_linear]
@CustomOp.register("replicated_linear") @CustomOp.register("replicated_linear")
class ReplicatedLinear(LinearBase): class ReplicatedLinear(LinearBase):
"""Replicated linear layer. """Replicated linear layer.
@ -313,6 +314,8 @@ class ReplicatedLinear(LinearBase):
disable_tp: Take no effect for replicated linear layers. disable_tp: Take no effect for replicated linear layers.
""" """
# --8<-- [end:replicated_linear]
def __init__( def __init__(
self, self,
input_size: int, input_size: int,
@ -413,6 +416,7 @@ class ReplicatedLinear(LinearBase):
return s return s
# --8<-- [start:column_parallel_linear]
@CustomOp.register("column_parallel_linear") @CustomOp.register("column_parallel_linear")
class ColumnParallelLinear(LinearBase): class ColumnParallelLinear(LinearBase):
"""Linear layer with column parallelism. """Linear layer with column parallelism.
@ -440,6 +444,8 @@ class ColumnParallelLinear(LinearBase):
disable_tp: If true, weights matrix won't be sharded through tp rank. disable_tp: If true, weights matrix won't be sharded through tp rank.
""" """
# --8<-- [end:column_parallel_linear]
def __init__( def __init__(
self, self,
input_size: int, input_size: int,
@ -1276,6 +1282,7 @@ class QKVParallelLinear(ColumnParallelLinear):
param_data.copy_(loaded_weight) param_data.copy_(loaded_weight)
# --8<-- [start:row_parallel_linear]
@CustomOp.register("row_parallel_linear") @CustomOp.register("row_parallel_linear")
class RowParallelLinear(LinearBase): class RowParallelLinear(LinearBase):
"""Linear layer with row parallelism. """Linear layer with row parallelism.
@ -1310,6 +1317,8 @@ class RowParallelLinear(LinearBase):
disable_tp: If true, weights matrix won't be sharded through tp rank. disable_tp: If true, weights matrix won't be sharded through tp rank.
""" """
# --8<-- [end:row_parallel_linear]
def __init__( def __init__(
self, self,
input_size: int, input_size: int,

View File

@ -13,6 +13,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmb
from vllm.platforms import current_platform from vllm.platforms import current_platform
# --8<-- [start:logits_processor]
@CustomOp.register("logits_processor") @CustomOp.register("logits_processor")
class LogitsProcessor(CustomOp): class LogitsProcessor(CustomOp):
"""Process logits and apply logits processors from sampling metadata. """Process logits and apply logits processors from sampling metadata.
@ -23,6 +24,8 @@ class LogitsProcessor(CustomOp):
3. Apply logits processors (if any). 3. Apply logits processors (if any).
""" """
# --8<-- [end:logits_processor]
def __init__( def __init__(
self, self,
vocab_size: int, vocab_size: int,

View File

@ -39,6 +39,7 @@ from vllm.v1.attention.backends.mamba1_attn import Mamba1AttentionMetadata
# Adapted from transformers.models.mamba.modeling_mamba.MambaMixer # Adapted from transformers.models.mamba.modeling_mamba.MambaMixer
# --8<-- [start:mamba_mixer]
@CustomOp.register("mamba_mixer") @CustomOp.register("mamba_mixer")
class MambaMixer(MambaBase, CustomOp): class MambaMixer(MambaBase, CustomOp):
""" """
@ -51,6 +52,8 @@ class MambaMixer(MambaBase, CustomOp):
**selective** state spaces) **selective** state spaces)
""" """
# --8<-- [end:mamba_mixer]
def __init__( def __init__(
self, self,
hidden_size: int, hidden_size: int,

View File

@ -49,8 +49,11 @@ from vllm.v1.attention.backends.mamba2_attn import Mamba2AttentionMetadata
# Adapted from transformers.models.mamba2.modeling_mamba2.MambaRMSNormGated # Adapted from transformers.models.mamba2.modeling_mamba2.MambaRMSNormGated
# --8<-- [start:mixer2_gated_rms_norm]
@CustomOp.register("mixer2_gated_rms_norm") @CustomOp.register("mixer2_gated_rms_norm")
class Mixer2RMSNormGated(CustomOp): class Mixer2RMSNormGated(CustomOp):
# --8<-- [end:mixer2_gated_rms_norm]
def __init__( def __init__(
self, self,
full_hidden_size: int, full_hidden_size: int,
@ -214,6 +217,7 @@ def mamba_v2_sharded_weight_loader(
# Adapted from transformers.models.mamba.modeling_mamba.MambaMixer # Adapted from transformers.models.mamba.modeling_mamba.MambaMixer
# --8<-- [start:mamba_mixer2]
@CustomOp.register("mamba_mixer2") @CustomOp.register("mamba_mixer2")
class MambaMixer2(MambaBase, CustomOp): class MambaMixer2(MambaBase, CustomOp):
""" """
@ -226,6 +230,8 @@ class MambaMixer2(MambaBase, CustomOp):
**selective** state spaces) **selective** state spaces)
""" """
# --8<-- [end:mamba_mixer2]
def __init__( def __init__(
self, self,
hidden_size: int, hidden_size: int,

View File

@ -27,8 +27,11 @@ from vllm.utils.torch_utils import direct_register_custom_op
from vllm.v1.attention.backends.short_conv_attn import ShortConvAttentionMetadata from vllm.v1.attention.backends.short_conv_attn import ShortConvAttentionMetadata
# --8<-- [start:short_conv]
@CustomOp.register("short_conv") @CustomOp.register("short_conv")
class ShortConv(MambaBase, CustomOp): class ShortConv(MambaBase, CustomOp):
# --8<-- [end:short_conv]
def __init__( def __init__(
self, self,
config, config,

View File

@ -29,6 +29,7 @@ class MLAModules:
indexer_rotary_emb: torch.nn.Module | None = None indexer_rotary_emb: torch.nn.Module | None = None
# --8<-- [start:multi_head_latent_attention]
@CustomOp.register("multi_head_latent_attention") @CustomOp.register("multi_head_latent_attention")
class MultiHeadLatentAttentionWrapper(CustomOp): class MultiHeadLatentAttentionWrapper(CustomOp):
"""MLA layer registered as CustomOp to allow OOT backends to add """MLA layer registered as CustomOp to allow OOT backends to add
@ -47,6 +48,8 @@ class MultiHeadLatentAttentionWrapper(CustomOp):
3. Return the output tensor. 3. Return the output tensor.
""" """
# --8<-- [end:multi_head_latent_attention]
def __init__( def __init__(
self, self,
hidden_size: int, hidden_size: int,

View File

@ -19,6 +19,7 @@ _FP8_MIN = -224.0 if current_platform.is_fp8_fnuz() else _FP8_FINFO.min
_FP8_MIN_SCALING_FACTOR = 1.0 / (_FP8_MAX * 512.0) _FP8_MIN_SCALING_FACTOR = 1.0 / (_FP8_MAX * 512.0)
# --8<-- [start:quant_fp8]
@CustomOp.register("quant_fp8") @CustomOp.register("quant_fp8")
class QuantFP8(CustomOp): class QuantFP8(CustomOp):
""" """
@ -26,6 +27,8 @@ class QuantFP8(CustomOp):
This CustomOp supports both static and dynamic quantization. This CustomOp supports both static and dynamic quantization.
""" """
# --8<-- [end:quant_fp8]
def __init__( def __init__(
self, self,
static: bool, static: bool,

View File

@ -10,10 +10,13 @@ from vllm.model_executor.custom_op import CustomOp
from .common import ApplyRotaryEmb from .common import ApplyRotaryEmb
# --8<-- [start:rotary_embedding]
@CustomOp.register("rotary_embedding") @CustomOp.register("rotary_embedding")
class RotaryEmbeddingBase(CustomOp): class RotaryEmbeddingBase(CustomOp):
"""Original rotary positional embedding.""" """Original rotary positional embedding."""
# --8<-- [end:rotary_embedding]
def __init__( def __init__(
self, self,
head_size: int, head_size: int,

View File

@ -118,8 +118,11 @@ direct_register_custom_op(
) )
# --8<-- [start:apply_rotary_emb]
@CustomOp.register("apply_rotary_emb") @CustomOp.register("apply_rotary_emb")
class ApplyRotaryEmb(CustomOp): class ApplyRotaryEmb(CustomOp):
# --8<-- [end:apply_rotary_emb]
def __init__( def __init__(
self, self,
enforce_enable: bool = False, enforce_enable: bool = False,

View File

@ -9,10 +9,13 @@ from vllm.model_executor.custom_op import CustomOp
from .common import rotate_gptj, rotate_neox from .common import rotate_gptj, rotate_neox
# --8<-- [start:dual_chunk_rotary_embedding]
@CustomOp.register("dual_chunk_rotary_embedding") @CustomOp.register("dual_chunk_rotary_embedding")
class DualChunkRotaryEmbedding(CustomOp): class DualChunkRotaryEmbedding(CustomOp):
"""Rotary positional embedding for Dual Chunk Attention.""" """Rotary positional embedding for Dual Chunk Attention."""
# --8<-- [end:dual_chunk_rotary_embedding]
def __init__( def __init__(
self, self,
head_size: int, head_size: int,

View File

@ -181,6 +181,7 @@ def get_masked_input_and_mask(
return input_, ~vocab_mask return input_, ~vocab_mask
# --8<-- [start:vocab_parallel_embedding]
@CustomOp.register("vocab_parallel_embedding") @CustomOp.register("vocab_parallel_embedding")
class VocabParallelEmbedding(CustomOp): class VocabParallelEmbedding(CustomOp):
"""Embedding parallelized in the vocabulary dimension. """Embedding parallelized in the vocabulary dimension.
@ -221,6 +222,8 @@ class VocabParallelEmbedding(CustomOp):
prefix: full name of the layer in the state dict prefix: full name of the layer in the state dict
""" # noqa: E501 """ # noqa: E501
# --8<-- [end:vocab_parallel_embedding]
def __init__( def __init__(
self, self,
num_embeddings: int, num_embeddings: int,
@ -492,6 +495,7 @@ class VocabParallelEmbedding(CustomOp):
return s return s
# --8<-- [start:parallel_lm_head]
@CustomOp.register("parallel_lm_head") @CustomOp.register("parallel_lm_head")
class ParallelLMHead(VocabParallelEmbedding): class ParallelLMHead(VocabParallelEmbedding):
"""Parallelized LM head. """Parallelized LM head.
@ -509,6 +513,8 @@ class ParallelLMHead(VocabParallelEmbedding):
padding_size: padding size for the vocabulary. padding_size: padding size for the vocabulary.
""" """
# --8<-- [end:parallel_lm_head]
def __init__( def __init__(
self, self,
num_embeddings: int, num_embeddings: int,

View File

@ -97,8 +97,11 @@ def is_mamba(config: Plamo2Config, i: int) -> bool:
# Adapted from: # Adapted from:
# vllm.model_executor.layers.mamba.mamba_mixer2.MambaMixer2 # vllm.model_executor.layers.mamba.mamba_mixer2.MambaMixer2
# transformers.models.mamba.modeling_mamba.MambaMixer # transformers.models.mamba.modeling_mamba.MambaMixer
@CustomOp.register(name="plamo2_mamba_mixer") # --8<-- [start:plamo2_mamba_mixer]
@CustomOp.register("plamo2_mamba_mixer")
class Plamo2MambaMixer(MambaBase, CustomOp): class Plamo2MambaMixer(MambaBase, CustomOp):
# --8<-- [end:plamo2_mamba_mixer]
def __init__(self, vllm_config: VllmConfig, *, prefix: str = "", **kwargs) -> None: def __init__(self, vllm_config: VllmConfig, *, prefix: str = "", **kwargs) -> None:
super().__init__() super().__init__()
self.config = vllm_config.model_config.hf_config self.config = vllm_config.model_config.hf_config

View File

@ -37,10 +37,13 @@ if TYPE_CHECKING:
from vllm.config import VllmConfig from vllm.config import VllmConfig
# --8<-- [start:transformers_fused_moe]
@CustomOp.register("transformers_fused_moe") @CustomOp.register("transformers_fused_moe")
class TransformersFusedMoE(FusedMoE): class TransformersFusedMoE(FusedMoE):
"""Custom FusedMoE for the Transformers modeling backend.""" """Custom FusedMoE for the Transformers modeling backend."""
# --8<-- [end:transformers_fused_moe]
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self._topk_ids: torch.Tensor = None self._topk_ids: torch.Tensor = None