diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index a94215ee397bf..140f00294765d 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -28,42 +28,49 @@ class Relu3(ReLUSquaredActivation): @pytest.mark.parametrize( - "env, torch_level, ops_enabled, default_on", + "env, torch_level, use_inductor, ops_enabled, default_on", [ # Default values based on compile level - ("", 0, [True] * 4, True), - ("", 1, [True] * 4, True), - ("", 2, [True] * 4, True), # All by default - ("", 3, [False] * 4, False), - ("", 4, [False] * 4, False), # None by default + # - All by default (no Inductor compilation) + ("", 0, False, [True] * 4, True), + ("", 1, True, [True] * 4, True), + ("", 2, False, [True] * 4, True), + # - None by default (with Inductor) + ("", 3, True, [False] * 4, False), + ("", 4, True, [False] * 4, False), + # - All by default (without Inductor) + ("", 3, False, [True] * 4, True), + ("", 4, False, [True] * 4, True), # Explicitly enabling/disabling # # Default: all # # All but SiluAndMul - ("+rms_norm,-silu_and_mul", 0, [1, 0, 1, 1], True), + ("+rms_norm,-silu_and_mul", 0, True, [1, 0, 1, 1], True), # Only ReLU3 - ("none,-rms_norm,+relu3", 0, [0, 0, 0, 1], False), + ("none,-rms_norm,+relu3", 1, False, [0, 0, 0, 1], False), # All but SiluAndMul - ("all,-silu_and_mul", 1, [1, 0, 1, 1], True), + ("all,-silu_and_mul", 2, True, [1, 0, 1, 1], True), # All but ReLU3 (even if ReLU2 is on) - ("-relu3,relu2", 1, [1, 1, 1, 0], True), - # GeluAndMul and SiluAndMul - ("none,-relu3,+gelu_and_mul,+silu_and_mul", 2, [0, 1, 1, 0], False), + ("-relu3,relu2", 3, False, [1, 1, 1, 0], True), + # RMSNorm and SiluAndMul + ("none,-relu3,+rms_norm,+silu_and_mul", 4, False, [1, 1, 0, 0], False), # All but RMSNorm - ("-rms_norm", 2, [0, 1, 1, 1], True), + ("-rms_norm", 3, False, [0, 1, 1, 1], True), # # Default: none # # Only ReLU3 - ("-silu_and_mul,+relu3", 3, [0, 0, 
0, 1], False), + ("-silu_and_mul,+relu3", 3, True, [0, 0, 0, 1], False), # All but RMSNorm - ("all,-rms_norm", 4, [0, 1, 1, 1], True), + ("all,-rms_norm", 4, True, [0, 1, 1, 1], True), ]) -def test_enabled_ops(env: str, torch_level: int, ops_enabled: list[int], - default_on: bool): - vllm_config = VllmConfig(compilation_config=CompilationConfig( - level=torch_level, custom_ops=env.split(","))) +def test_enabled_ops(env: str, torch_level: int, use_inductor: bool, + ops_enabled: list[int], default_on: bool): + vllm_config = VllmConfig( + compilation_config=CompilationConfig(use_inductor=bool(use_inductor), + level=torch_level, + custom_ops=env.split(","))) with set_current_vllm_config(vllm_config): assert CustomOp.default_on() == default_on diff --git a/vllm/config.py b/vllm/config.py index 623ba3aaf1093..84aa14b7c8605 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3994,7 +3994,8 @@ class CompilationConfig: - 'none,+op1,+op2' to enable only op1 and op2 By default, all custom ops are enabled when running without Inductor and - disabled when running with Inductor (compile_level >= Inductor).""" + disabled when running with Inductor: level>=PIECEWISE and use_inductor=True. + Inductor generates (fused) Triton kernels for disabled custom ops.""" splitting_ops: list[str] = field(default_factory=list) """A list of ops to split the full graph into subgraphs, used in piecewise compilation.""" @@ -4003,10 +4004,13 @@ class CompilationConfig: use_inductor: bool = True """Whether to use inductor compilation: - - False: inductor compilation is not used. graph runs in eager. - - True: inductor compilation is used. one graph for symbolic shape - is compiled. In addition, compile for compile_sizes, - using configurations in inductor_compile_config.""" + - False: inductor compilation is not used. graph runs in eager + (custom_ops enabled by default). + - True: inductor compilation is used (custom_ops disabled by default). 
+      One graph for symbolic shape and one graph per size in compile_sizes +      are compiled using configurations in inductor_compile_config. + +      This setting is ignored if level<PIECEWISE.""" diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 1680b723d6a29..9c88721fb2782 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -141,16 +141,16 @@ class CustomOp(nn.Module): @staticmethod def default_on() -> bool: """ -        On by default if level < CompilationLevel.PIECEWISE +        On by default if PyTorch Inductor is not used. Specifying 'all' or 'none' in custom_op takes precedence. """ from vllm.config import CompilationLevel compilation_config = get_current_vllm_config().compilation_config -        custom_ops = compilation_config.custom_ops -        count_none = custom_ops.count("none") -        count_all = custom_ops.count("all") -        return compilation_config.level < CompilationLevel.PIECEWISE and \ -            not count_none > 0 or count_all > 0 +        default_on = (compilation_config.level < CompilationLevel.PIECEWISE +                      or not compilation_config.use_inductor) +        count_none = compilation_config.custom_ops.count("none") +        count_all = compilation_config.custom_ops.count("all") +        return default_on and not count_none > 0 or count_all > 0 # Dictionary of all custom ops (classes, indexed by registered name). # To check if an op with a name is enabled, call .enabled() on the class.