mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-26 14:37:06 +08:00
[Fix][torch.compile] Enable custom ops by default when Inductor off (#20102)
Signed-off-by: luka <luka@neuralmagic.com>
This commit is contained in:
parent
94a55c7681
commit
aafabaa0d5
@ -28,42 +28,49 @@ class Relu3(ReLUSquaredActivation):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"env, torch_level, ops_enabled, default_on",
|
"env, torch_level, use_inductor, ops_enabled, default_on",
|
||||||
[
|
[
|
||||||
# Default values based on compile level
|
# Default values based on compile level
|
||||||
("", 0, [True] * 4, True),
|
# - All by default (level < PIECEWISE: no Inductor compilation)
|
||||||
("", 1, [True] * 4, True),
|
("", 0, False, [True] * 4, True),
|
||||||
("", 2, [True] * 4, True), # All by default
|
("", 1, True, [True] * 4, True),
|
||||||
("", 3, [False] * 4, False),
|
("", 2, False, [True] * 4, True),
|
||||||
("", 4, [False] * 4, False), # None by default
|
# - None by default (with Inductor)
|
||||||
|
("", 3, True, [False] * 4, False),
|
||||||
|
("", 4, True, [False] * 4, False),
|
||||||
|
# - All by default (without Inductor)
|
||||||
|
("", 3, False, [True] * 4, True),
|
||||||
|
("", 4, False, [True] * 4, True),
|
||||||
# Explicitly enabling/disabling
|
# Explicitly enabling/disabling
|
||||||
#
|
#
|
||||||
# Default: all
|
# Default: all
|
||||||
#
|
#
|
||||||
# All but SiluAndMul
|
# All but SiluAndMul
|
||||||
("+rms_norm,-silu_and_mul", 0, [1, 0, 1, 1], True),
|
("+rms_norm,-silu_and_mul", 0, True, [1, 0, 1, 1], True),
|
||||||
# Only ReLU3
|
# Only ReLU3
|
||||||
("none,-rms_norm,+relu3", 0, [0, 0, 0, 1], False),
|
("none,-rms_norm,+relu3", 1, False, [0, 0, 0, 1], False),
|
||||||
# All but SiluAndMul
|
# All but SiluAndMul
|
||||||
("all,-silu_and_mul", 1, [1, 0, 1, 1], True),
|
("all,-silu_and_mul", 2, True, [1, 0, 1, 1], True),
|
||||||
# All but ReLU3 (even if ReLU2 is on)
|
# All but ReLU3 (even if ReLU2 is on)
|
||||||
("-relu3,relu2", 1, [1, 1, 1, 0], True),
|
("-relu3,relu2", 3, False, [1, 1, 1, 0], True),
|
||||||
# GeluAndMul and SiluAndMul
|
# RMSNorm and SiluAndMul
|
||||||
("none,-relu3,+gelu_and_mul,+silu_and_mul", 2, [0, 1, 1, 0], False),
|
("none,-relu3,+rms_norm,+silu_and_mul", 4, False, [1, 1, 0, 0], False),
|
||||||
# All but RMSNorm
|
# All but RMSNorm
|
||||||
("-rms_norm", 2, [0, 1, 1, 1], True),
|
("-rms_norm", 3, False, [0, 1, 1, 1], True),
|
||||||
#
|
#
|
||||||
# Default: none
|
# Default: none
|
||||||
#
|
#
|
||||||
# Only ReLU3
|
# Only ReLU3
|
||||||
("-silu_and_mul,+relu3", 3, [0, 0, 0, 1], False),
|
("-silu_and_mul,+relu3", 3, True, [0, 0, 0, 1], False),
|
||||||
# All but RMSNorm
|
# All but RMSNorm
|
||||||
("all,-rms_norm", 4, [0, 1, 1, 1], True),
|
("all,-rms_norm", 4, True, [0, 1, 1, 1], True),
|
||||||
])
|
])
|
||||||
def test_enabled_ops(env: str, torch_level: int, ops_enabled: list[int],
|
def test_enabled_ops(env: str, torch_level: int, use_inductor: bool,
|
||||||
default_on: bool):
|
ops_enabled: list[int], default_on: bool):
|
||||||
vllm_config = VllmConfig(compilation_config=CompilationConfig(
|
vllm_config = VllmConfig(
|
||||||
level=torch_level, custom_ops=env.split(",")))
|
compilation_config=CompilationConfig(use_inductor=bool(use_inductor),
|
||||||
|
level=torch_level,
|
||||||
|
custom_ops=env.split(",")))
|
||||||
with set_current_vllm_config(vllm_config):
|
with set_current_vllm_config(vllm_config):
|
||||||
assert CustomOp.default_on() == default_on
|
assert CustomOp.default_on() == default_on
|
||||||
|
|
||||||
|
|||||||
@ -3994,7 +3994,8 @@ class CompilationConfig:
|
|||||||
- 'none,+op1,+op2' to enable only op1 and op2
|
- 'none,+op1,+op2' to enable only op1 and op2
|
||||||
|
|
||||||
By default, all custom ops are enabled when running without Inductor and
|
By default, all custom ops are enabled when running without Inductor and
|
||||||
disabled when running with Inductor (compile_level >= Inductor)."""
|
disabled when running with Inductor: level>=PIECEWISE and use_inductor=True.
|
||||||
|
Inductor generates (fused) Triton kernels for disabled custom ops."""
|
||||||
splitting_ops: list[str] = field(default_factory=list)
|
splitting_ops: list[str] = field(default_factory=list)
|
||||||
"""A list of ops to split the full graph into subgraphs, used in piecewise
|
"""A list of ops to split the full graph into subgraphs, used in piecewise
|
||||||
compilation."""
|
compilation."""
|
||||||
@ -4003,10 +4004,13 @@ class CompilationConfig:
|
|||||||
use_inductor: bool = True
|
use_inductor: bool = True
|
||||||
"""Whether to use inductor compilation:
|
"""Whether to use inductor compilation:
|
||||||
|
|
||||||
- False: inductor compilation is not used. graph runs in eager.
|
- False: inductor compilation is not used. The graph runs in eager mode
|
||||||
- True: inductor compilation is used. one graph for symbolic shape
|
(custom_ops enabled by default).
|
||||||
is compiled. In addition, compile for compile_sizes,
|
- True: inductor compilation is used (custom_ops disabled by default).
|
||||||
using configurations in inductor_compile_config."""
|
One graph for symbolic shape and one graph per size in compile_sizes
|
||||||
|
are compiled using configurations in inductor_compile_config.
|
||||||
|
|
||||||
|
This setting is ignored if level<PIECEWISE."""
|
||||||
compile_sizes: Optional[list[Union[int, str]]] = None
|
compile_sizes: Optional[list[Union[int, str]]] = None
|
||||||
"""Sizes to compile for inductor. In addition
|
"""Sizes to compile for inductor. In addition
|
||||||
to integers, it also supports "cudagraph_capture_sizes" to
|
to integers, it also supports "cudagraph_capture_sizes" to
|
||||||
@ -4537,19 +4541,6 @@ class VllmConfig:
|
|||||||
self.compilation_config.level = CompilationLevel.PIECEWISE
|
self.compilation_config.level = CompilationLevel.PIECEWISE
|
||||||
self.compilation_config.set_splitting_ops_for_v1()
|
self.compilation_config.set_splitting_ops_for_v1()
|
||||||
|
|
||||||
# The behavior of custom ops with inductor depends on the config:
|
|
||||||
# - If use_inductor=True and custom_ops is empty:
|
|
||||||
# Inductor generates Triton kernels for all registered custom ops
|
|
||||||
# (default behavior)
|
|
||||||
# - If use_inductor=True and custom_ops is non-empty:
|
|
||||||
# Custom CUDA kernels are used for specified ops while inductor
|
|
||||||
# generates Triton kernels for remaining ops, including misc torch
|
|
||||||
# ops in the model.
|
|
||||||
if (not self.compilation_config.custom_ops
|
|
||||||
and self.compilation_config.use_inductor):
|
|
||||||
# Let inductor generate Triton kernels for the custom ops.
|
|
||||||
self.compilation_config.custom_ops = ["none"]
|
|
||||||
|
|
||||||
self._set_cudagraph_sizes()
|
self._set_cudagraph_sizes()
|
||||||
|
|
||||||
if self.cache_config.cpu_offload_gb > 0 and \
|
if self.cache_config.cpu_offload_gb > 0 and \
|
||||||
|
|||||||
@ -141,16 +141,16 @@ class CustomOp(nn.Module):
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def default_on() -> bool:
|
def default_on() -> bool:
|
||||||
"""
|
"""
|
||||||
On by default if level < CompilationLevel.PIECEWISE
|
On by default if PyTorch Inductor is not used.
|
||||||
Specifying 'all' or 'none' in custom_op takes precedence.
|
Specifying 'all' or 'none' in custom_ops takes precedence.
|
||||||
"""
|
"""
|
||||||
from vllm.config import CompilationLevel
|
from vllm.config import CompilationLevel
|
||||||
compilation_config = get_current_vllm_config().compilation_config
|
compilation_config = get_current_vllm_config().compilation_config
|
||||||
custom_ops = compilation_config.custom_ops
|
default_on = (compilation_config.level < CompilationLevel.PIECEWISE
|
||||||
count_none = custom_ops.count("none")
|
or not compilation_config.use_inductor)
|
||||||
count_all = custom_ops.count("all")
|
count_none = compilation_config.custom_ops.count("none")
|
||||||
return compilation_config.level < CompilationLevel.PIECEWISE and \
|
count_all = compilation_config.custom_ops.count("all")
|
||||||
not count_none > 0 or count_all > 0
|
return default_on and not count_none > 0 or count_all > 0
|
||||||
|
|
||||||
# Dictionary of all custom ops (classes, indexed by registered name).
|
# Dictionary of all custom ops (classes, indexed by registered name).
|
||||||
# To check if an op with a name is enabled, call .enabled() on the class.
|
# To check if an op with a name is enabled, call .enabled() on the class.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user