From 51fc9e017a721c7fb283cecf3231bbe6e358132b Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Tue, 25 Nov 2025 12:55:42 +0000
Subject: [PATCH] Scheduled removal of `CompilationConfig.use_inductor`
 (#29323)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/compile/fullgraph/test_simple.py | 12 +++++------
 tests/engine/test_arg_utils.py         |  9 ++++----
 tests/utils_/test_argparse_utils.py    |  4 ++--
 vllm/config/compilation.py             | 29 +-------------------------
 4 files changed, 13 insertions(+), 41 deletions(-)

diff --git a/tests/compile/fullgraph/test_simple.py b/tests/compile/fullgraph/test_simple.py
index e258133ab50a7..36cc1510ed798 100644
--- a/tests/compile/fullgraph/test_simple.py
+++ b/tests/compile/fullgraph/test_simple.py
@@ -55,7 +55,7 @@ class SillyModel(nn.Module):
 def _run_simple_model(
     splitting_ops,
     use_inductor_graph_partition,
-    use_inductor,
+    backend,
     expected_num_piecewise_graphs_seen,
     expected_num_piecewise_capturable_graphs_seen,
     expected_num_backend_compilations,
@@ -64,7 +64,7 @@ def _run_simple_model(
     vllm_config = VllmConfig(
         compilation_config=CompilationConfig(
             mode=CompilationMode.VLLM_COMPILE,
-            use_inductor=use_inductor,
+            backend=backend,
             splitting_ops=splitting_ops,
             use_inductor_graph_partition=use_inductor_graph_partition,
             cudagraph_copy_inputs=True,
@@ -124,14 +124,14 @@ def _run_simple_model(
         assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
 
 
-@pytest.mark.parametrize("use_inductor", [True, False])
+@pytest.mark.parametrize("backend", ["inductor", "eager"])
 @torch.inference_mode()
 @create_new_process_for_each_test("spawn")
-def test_simple_piecewise_compile(use_inductor):
+def test_simple_piecewise_compile(backend):
     _run_simple_model(
         splitting_ops=["silly::attention"],
         use_inductor_graph_partition=False,
-        use_inductor=use_inductor,
+        backend=backend,
         # 2 * num_layers + 1
         expected_num_piecewise_graphs_seen=5,
         # 1 + num_layers
@@ -155,7 +155,7 @@ def test_simple_inductor_graph_partition(monkeypatch):
     _run_simple_model(
         splitting_ops=["silly::attention"],
         use_inductor_graph_partition=True,
-        use_inductor=True,
+        backend="inductor",
         # Since not splitting at fx graph level
         expected_num_piecewise_graphs_seen=1,
         # Since not splitting at fx graph level
diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py
index 10827e3b4b9cd..93bc94123aaa9 100644
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@@ -249,14 +249,13 @@ def test_compilation_config():
     args = parser.parse_args(
         [
             "-O",
-            '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
-            '"use_inductor": false}',
+            '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], "backend": "eager"}',
         ]
     )
     assert (
         args.compilation_config.mode == 3
         and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
-        and not args.compilation_config.use_inductor
+        and args.compilation_config.backend == "eager"
     )
 
     # set to string form of a dict
@@ -264,13 +263,13 @@ def test_compilation_config():
         [
             "--compilation-config="
             '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
-            '"use_inductor": true}',
+            '"backend": "inductor"}',
         ]
     )
     assert (
         args.compilation_config.mode == 3
         and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
-        and args.compilation_config.use_inductor
+        and args.compilation_config.backend == "inductor"
     )
 
 
diff --git a/tests/utils_/test_argparse_utils.py b/tests/utils_/test_argparse_utils.py
index 3310753d2b6d6..32d4eca541356 100644
--- a/tests/utils_/test_argparse_utils.py
+++ b/tests/utils_/test_argparse_utils.py
@@ -166,7 +166,7 @@ def test_dict_args(parser):
         "--hf-overrides.key2.key4",
         "val3",
         # Test compile config and compilation mode
-        "-O.use_inductor=true",
+        "-O.use_inductor_graph_partition=true",
         "-O.backend",
         "custom",
         "-O1",
@@ -219,7 +219,7 @@ def test_dict_args(parser):
     }
     assert parsed_args.compilation_config == {
         "mode": 1,
-        "use_inductor": True,
+        "use_inductor_graph_partition": True,
         "backend": "custom",
         "custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"],
     }
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 556b2d9168b32..865d045676d14 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -264,7 +264,6 @@ class CompilationConfig:
         - [`cudagraph_copy_inputs`]
         [vllm.config.CompilationConfig.cudagraph_copy_inputs]
     - Inductor compilation:
-        - [`use_inductor`][vllm.config.CompilationConfig.use_inductor]
         - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
         - [`inductor_compile_config`]
         [vllm.config.CompilationConfig.inductor_compile_config]
@@ -348,7 +347,7 @@ class CompilationConfig:
     - 'none,+op1,+op2' to enable only op1 and op2
 
     By default, all custom ops are enabled when running without Inductor and
-    disabled when running with Inductor: mode>=VLLM_COMPILE and use_inductor=True.
+    disabled when running with Inductor: mode>=VLLM_COMPILE and backend="inductor".
     Inductor generates (fused) Triton kernels for disabled custom ops."""
     splitting_ops: list[str] | None = None
     """A list of ops to exclude from cudagraphs, used in piecewise compilation.
@@ -374,24 +373,6 @@ class CompilationConfig:
     Disabled by default until more models are supported/tested to work."""
 
     # Inductor capture
-    use_inductor: bool | None = None
-    """
-    Whether to use inductor compilation.
-
-    This flag is deprecated and will be removed in the next release 0.12.0.
-    Please use the 'backend' option instead.
-
-    - False: inductor compilation is not used. graph runs in eager
-        (custom_ops enabled by default).
-    - True: inductor compilation is used (custom_ops disabled by default).
-        One graph for symbolic shape and one graph per size in compile_sizes
-        are compiled using configurations in inductor_compile_config.
-
-    This setting is ignored if mode<VLLM_COMPILE.
-
-    For future compatibility:
-    If use_inductor is True, backend="inductor" otherwise backend="eager".
-    """
     compile_sizes: list[int | str] | None = None
     """Sizes to compile for inductor. In addition
     to integers, it also supports "cudagraph_capture_sizes" to
@@ -759,14 +740,6 @@ class CompilationConfig:
                 f"Invalid backend for piecewise compilation: {self.backend}"
             )
 
-        if self.use_inductor is not None:
-            logger.warning_once(
-                "The 'use_inductor' flag is deprecated and will be "
-                "removed in the next release (v0.12.0). "
-                "Please use the 'backend' option instead.",
-            )
-            self.backend = "inductor" if self.use_inductor else "eager"
-
         if self.backend == "":
             self.backend = current_platform.get_compile_backend()