From 51fc9e017a721c7fb283cecf3231bbe6e358132b Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 25 Nov 2025 12:55:42 +0000 Subject: [PATCH] Scheduled removal of `CompilationConfig.use_inductor` (#29323) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/compile/fullgraph/test_simple.py | 12 +++++------ tests/engine/test_arg_utils.py | 9 ++++---- tests/utils_/test_argparse_utils.py | 4 ++-- vllm/config/compilation.py | 29 +------------------------- 4 files changed, 13 insertions(+), 41 deletions(-) diff --git a/tests/compile/fullgraph/test_simple.py b/tests/compile/fullgraph/test_simple.py index e258133ab50a7..36cc1510ed798 100644 --- a/tests/compile/fullgraph/test_simple.py +++ b/tests/compile/fullgraph/test_simple.py @@ -55,7 +55,7 @@ class SillyModel(nn.Module): def _run_simple_model( splitting_ops, use_inductor_graph_partition, - use_inductor, + backend, expected_num_piecewise_graphs_seen, expected_num_piecewise_capturable_graphs_seen, expected_num_backend_compilations, @@ -64,7 +64,7 @@ def _run_simple_model( vllm_config = VllmConfig( compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, - use_inductor=use_inductor, + backend=backend, splitting_ops=splitting_ops, use_inductor_graph_partition=use_inductor_graph_partition, cudagraph_copy_inputs=True, @@ -124,14 +124,14 @@ def _run_simple_model( assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0])) -@pytest.mark.parametrize("use_inductor", [True, False]) +@pytest.mark.parametrize("backend", ["inductor", "eager"]) @torch.inference_mode() @create_new_process_for_each_test("spawn") -def test_simple_piecewise_compile(use_inductor): +def test_simple_piecewise_compile(backend): _run_simple_model( splitting_ops=["silly::attention"], use_inductor_graph_partition=False, - use_inductor=use_inductor, + backend=backend, # 2 * num_layers + 1 expected_num_piecewise_graphs_seen=5, # 1 + num_layers @@ -155,7 +155,7 @@ def test_simple_inductor_graph_partition(monkeypatch): _run_simple_model( splitting_ops=["silly::attention"], use_inductor_graph_partition=True, - use_inductor=True, + backend="inductor", # Since not splitting at fx graph level expected_num_piecewise_graphs_seen=1, # Since not splitting at fx graph level diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 10827e3b4b9cd..93bc94123aaa9 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -249,14 +249,13 @@ def test_compilation_config(): args = parser.parse_args( [ "-O", - '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' - '"use_inductor": false}', + '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], "backend": "eager"}', ] ) assert ( args.compilation_config.mode == 3 and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] - and not args.compilation_config.use_inductor + and args.compilation_config.backend == "eager" ) # set to string form of a dict @@ -264,13 +263,13 @@ def test_compilation_config(): [ "--compilation-config=" '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' - '"use_inductor": true}', + '"backend": "inductor"}', ] ) assert ( args.compilation_config.mode == 3 and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] - and args.compilation_config.use_inductor + and args.compilation_config.backend == "inductor" ) diff --git a/tests/utils_/test_argparse_utils.py b/tests/utils_/test_argparse_utils.py index 3310753d2b6d6..32d4eca541356 100644 --- a/tests/utils_/test_argparse_utils.py +++ b/tests/utils_/test_argparse_utils.py @@ -166,7 +166,7 @@ def test_dict_args(parser): "--hf-overrides.key2.key4", "val3", # Test compile config and compilation mode - "-O.use_inductor=true", + "-O.use_inductor_graph_partition=true", "-O.backend", "custom", "-O1", @@ -219,7 +219,7 @@ def test_dict_args(parser): } assert parsed_args.compilation_config == { "mode": 1, - "use_inductor": True, + "use_inductor_graph_partition": True, "backend": "custom", "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"], } diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 556b2d9168b32..865d045676d14 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -264,7 +264,6 @@ class CompilationConfig: - [`cudagraph_copy_inputs`] [vllm.config.CompilationConfig.cudagraph_copy_inputs] - Inductor compilation: - - [`use_inductor`][vllm.config.CompilationConfig.use_inductor] - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes] - [`inductor_compile_config`] [vllm.config.CompilationConfig.inductor_compile_config] @@ -348,7 +347,7 @@ class CompilationConfig: - 'none,+op1,+op2' to enable only op1 and op2 By default, all custom ops are enabled when running without Inductor and - disabled when running with Inductor: mode>=VLLM_COMPILE and use_inductor=True. + disabled when running with Inductor: mode>=VLLM_COMPILE and backend="inductor". Inductor generates (fused) Triton kernels for disabled custom ops.""" splitting_ops: list[str] | None = None """A list of ops to exclude from cudagraphs, used in piecewise compilation. @@ -374,24 +373,6 @@ class CompilationConfig: Disabled by default until more models are supported/tested to work.""" # Inductor capture - use_inductor: bool | None = None - """ - Whether to use inductor compilation. - - This flag is deprecated and will be removed in the next release 0.12.0. - Please use the 'backend' option instead. - - - False: inductor compilation is not used. graph runs in eager - (custom_ops enabled by default). - - True: inductor compilation is used (custom_ops disabled by default). - One graph for symbolic shape and one graph per size in compile_sizes - are compiled using configurations in inductor_compile_config. - - This setting is ignored if mode