Scheduled removal of CompilationConfig.use_inductor (#29323)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor 2025-11-25 12:55:42 +00:00 committed by GitHub
parent bf0c75cd4f
commit 51fc9e017a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 13 additions and 41 deletions

View File

@ -55,7 +55,7 @@ class SillyModel(nn.Module):
def _run_simple_model( def _run_simple_model(
splitting_ops, splitting_ops,
use_inductor_graph_partition, use_inductor_graph_partition,
use_inductor, backend,
expected_num_piecewise_graphs_seen, expected_num_piecewise_graphs_seen,
expected_num_piecewise_capturable_graphs_seen, expected_num_piecewise_capturable_graphs_seen,
expected_num_backend_compilations, expected_num_backend_compilations,
@ -64,7 +64,7 @@ def _run_simple_model(
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE, mode=CompilationMode.VLLM_COMPILE,
use_inductor=use_inductor, backend=backend,
splitting_ops=splitting_ops, splitting_ops=splitting_ops,
use_inductor_graph_partition=use_inductor_graph_partition, use_inductor_graph_partition=use_inductor_graph_partition,
cudagraph_copy_inputs=True, cudagraph_copy_inputs=True,
@ -124,14 +124,14 @@ def _run_simple_model(
assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0])) assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
@pytest.mark.parametrize("use_inductor", [True, False]) @pytest.mark.parametrize("backend", ["inductor", "eager"])
@torch.inference_mode() @torch.inference_mode()
@create_new_process_for_each_test("spawn") @create_new_process_for_each_test("spawn")
def test_simple_piecewise_compile(use_inductor): def test_simple_piecewise_compile(backend):
_run_simple_model( _run_simple_model(
splitting_ops=["silly::attention"], splitting_ops=["silly::attention"],
use_inductor_graph_partition=False, use_inductor_graph_partition=False,
use_inductor=use_inductor, backend=backend,
# 2 * num_layers + 1 # 2 * num_layers + 1
expected_num_piecewise_graphs_seen=5, expected_num_piecewise_graphs_seen=5,
# 1 + num_layers # 1 + num_layers
@ -155,7 +155,7 @@ def test_simple_inductor_graph_partition(monkeypatch):
_run_simple_model( _run_simple_model(
splitting_ops=["silly::attention"], splitting_ops=["silly::attention"],
use_inductor_graph_partition=True, use_inductor_graph_partition=True,
use_inductor=True, backend="inductor",
# Since not splitting at fx graph level # Since not splitting at fx graph level
expected_num_piecewise_graphs_seen=1, expected_num_piecewise_graphs_seen=1,
# Since not splitting at fx graph level # Since not splitting at fx graph level

View File

@ -249,14 +249,13 @@ def test_compilation_config():
args = parser.parse_args( args = parser.parse_args(
[ [
"-O", "-O",
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], "backend": "eager"}',
'"use_inductor": false}',
] ]
) )
assert ( assert (
args.compilation_config.mode == 3 args.compilation_config.mode == 3
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
and not args.compilation_config.use_inductor and args.compilation_config.backend == "eager"
) )
# set to string form of a dict # set to string form of a dict
@ -264,13 +263,13 @@ def test_compilation_config():
[ [
"--compilation-config=" "--compilation-config="
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
'"use_inductor": true}', '"backend": "inductor"}',
] ]
) )
assert ( assert (
args.compilation_config.mode == 3 args.compilation_config.mode == 3
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
and args.compilation_config.use_inductor and args.compilation_config.backend == "inductor"
) )

View File

@ -166,7 +166,7 @@ def test_dict_args(parser):
"--hf-overrides.key2.key4", "--hf-overrides.key2.key4",
"val3", "val3",
# Test compile config and compilation mode # Test compile config and compilation mode
"-O.use_inductor=true", "-O.use_inductor_graph_partition=true",
"-O.backend", "-O.backend",
"custom", "custom",
"-O1", "-O1",
@ -219,7 +219,7 @@ def test_dict_args(parser):
} }
assert parsed_args.compilation_config == { assert parsed_args.compilation_config == {
"mode": 1, "mode": 1,
"use_inductor": True, "use_inductor_graph_partition": True,
"backend": "custom", "backend": "custom",
"custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"], "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"],
} }

View File

@ -264,7 +264,6 @@ class CompilationConfig:
- [`cudagraph_copy_inputs`] - [`cudagraph_copy_inputs`]
[vllm.config.CompilationConfig.cudagraph_copy_inputs] [vllm.config.CompilationConfig.cudagraph_copy_inputs]
- Inductor compilation: - Inductor compilation:
- [`use_inductor`][vllm.config.CompilationConfig.use_inductor]
- [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes] - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
- [`inductor_compile_config`] - [`inductor_compile_config`]
[vllm.config.CompilationConfig.inductor_compile_config] [vllm.config.CompilationConfig.inductor_compile_config]
@ -348,7 +347,7 @@ class CompilationConfig:
- 'none,+op1,+op2' to enable only op1 and op2 - 'none,+op1,+op2' to enable only op1 and op2
By default, all custom ops are enabled when running without Inductor and By default, all custom ops are enabled when running without Inductor and
disabled when running with Inductor: mode>=VLLM_COMPILE and use_inductor=True. disabled when running with Inductor: mode>=VLLM_COMPILE and backend="inductor".
Inductor generates (fused) Triton kernels for disabled custom ops.""" Inductor generates (fused) Triton kernels for disabled custom ops."""
splitting_ops: list[str] | None = None splitting_ops: list[str] | None = None
"""A list of ops to exclude from cudagraphs, used in piecewise compilation. """A list of ops to exclude from cudagraphs, used in piecewise compilation.
@ -374,24 +373,6 @@ class CompilationConfig:
Disabled by default until more models are supported/tested to work.""" Disabled by default until more models are supported/tested to work."""
# Inductor capture # Inductor capture
use_inductor: bool | None = None
"""
Whether to use inductor compilation.
This flag is deprecated and will be removed in the next release 0.12.0.
Please use the 'backend' option instead.
- False: inductor compilation is not used. graph runs in eager
(custom_ops enabled by default).
- True: inductor compilation is used (custom_ops disabled by default).
One graph for symbolic shape and one graph per size in compile_sizes
are compiled using configurations in inductor_compile_config.
This setting is ignored if mode<VLLM_COMPILE.
For future compatibility:
If use_inductor is True, backend="inductor" otherwise backend="eager".
"""
compile_sizes: list[int | str] | None = None compile_sizes: list[int | str] | None = None
"""Sizes to compile for inductor. In addition """Sizes to compile for inductor. In addition
to integers, it also supports "cudagraph_capture_sizes" to to integers, it also supports "cudagraph_capture_sizes" to
@ -759,14 +740,6 @@ class CompilationConfig:
f"Invalid backend for piecewise compilation: {self.backend}" f"Invalid backend for piecewise compilation: {self.backend}"
) )
if self.use_inductor is not None:
logger.warning_once(
"The 'use_inductor' flag is deprecated and will be "
"removed in the next release (v0.12.0). "
"Please use the 'backend' option instead.",
)
self.backend = "inductor" if self.use_inductor else "eager"
if self.backend == "": if self.backend == "":
self.backend = current_platform.get_compile_backend() self.backend = current_platform.get_compile_backend()