mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-12 04:17:13 +08:00
Scheduled removal of CompilationConfig.use_inductor (#29323)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
parent
bf0c75cd4f
commit
51fc9e017a
@ -55,7 +55,7 @@ class SillyModel(nn.Module):
|
|||||||
def _run_simple_model(
|
def _run_simple_model(
|
||||||
splitting_ops,
|
splitting_ops,
|
||||||
use_inductor_graph_partition,
|
use_inductor_graph_partition,
|
||||||
use_inductor,
|
backend,
|
||||||
expected_num_piecewise_graphs_seen,
|
expected_num_piecewise_graphs_seen,
|
||||||
expected_num_piecewise_capturable_graphs_seen,
|
expected_num_piecewise_capturable_graphs_seen,
|
||||||
expected_num_backend_compilations,
|
expected_num_backend_compilations,
|
||||||
@ -64,7 +64,7 @@ def _run_simple_model(
|
|||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
mode=CompilationMode.VLLM_COMPILE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
use_inductor=use_inductor,
|
backend=backend,
|
||||||
splitting_ops=splitting_ops,
|
splitting_ops=splitting_ops,
|
||||||
use_inductor_graph_partition=use_inductor_graph_partition,
|
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||||
cudagraph_copy_inputs=True,
|
cudagraph_copy_inputs=True,
|
||||||
@ -124,14 +124,14 @@ def _run_simple_model(
|
|||||||
assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
|
assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("use_inductor", [True, False])
|
@pytest.mark.parametrize("backend", ["inductor", "eager"])
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
@create_new_process_for_each_test("spawn")
|
@create_new_process_for_each_test("spawn")
|
||||||
def test_simple_piecewise_compile(use_inductor):
|
def test_simple_piecewise_compile(backend):
|
||||||
_run_simple_model(
|
_run_simple_model(
|
||||||
splitting_ops=["silly::attention"],
|
splitting_ops=["silly::attention"],
|
||||||
use_inductor_graph_partition=False,
|
use_inductor_graph_partition=False,
|
||||||
use_inductor=use_inductor,
|
backend=backend,
|
||||||
# 2 * num_layers + 1
|
# 2 * num_layers + 1
|
||||||
expected_num_piecewise_graphs_seen=5,
|
expected_num_piecewise_graphs_seen=5,
|
||||||
# 1 + num_layers
|
# 1 + num_layers
|
||||||
@ -155,7 +155,7 @@ def test_simple_inductor_graph_partition(monkeypatch):
|
|||||||
_run_simple_model(
|
_run_simple_model(
|
||||||
splitting_ops=["silly::attention"],
|
splitting_ops=["silly::attention"],
|
||||||
use_inductor_graph_partition=True,
|
use_inductor_graph_partition=True,
|
||||||
use_inductor=True,
|
backend="inductor",
|
||||||
# Since not splitting at fx graph level
|
# Since not splitting at fx graph level
|
||||||
expected_num_piecewise_graphs_seen=1,
|
expected_num_piecewise_graphs_seen=1,
|
||||||
# Since not splitting at fx graph level
|
# Since not splitting at fx graph level
|
||||||
|
|||||||
@ -249,14 +249,13 @@ def test_compilation_config():
|
|||||||
args = parser.parse_args(
|
args = parser.parse_args(
|
||||||
[
|
[
|
||||||
"-O",
|
"-O",
|
||||||
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
|
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], "backend": "eager"}',
|
||||||
'"use_inductor": false}',
|
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
assert (
|
assert (
|
||||||
args.compilation_config.mode == 3
|
args.compilation_config.mode == 3
|
||||||
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
|
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
|
||||||
and not args.compilation_config.use_inductor
|
and args.compilation_config.backend == "eager"
|
||||||
)
|
)
|
||||||
|
|
||||||
# set to string form of a dict
|
# set to string form of a dict
|
||||||
@ -264,13 +263,13 @@ def test_compilation_config():
|
|||||||
[
|
[
|
||||||
"--compilation-config="
|
"--compilation-config="
|
||||||
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
|
'{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], '
|
||||||
'"use_inductor": true}',
|
'"backend": "inductor"}',
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
assert (
|
assert (
|
||||||
args.compilation_config.mode == 3
|
args.compilation_config.mode == 3
|
||||||
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
|
and args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8]
|
||||||
and args.compilation_config.use_inductor
|
and args.compilation_config.backend == "inductor"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -166,7 +166,7 @@ def test_dict_args(parser):
|
|||||||
"--hf-overrides.key2.key4",
|
"--hf-overrides.key2.key4",
|
||||||
"val3",
|
"val3",
|
||||||
# Test compile config and compilation mode
|
# Test compile config and compilation mode
|
||||||
"-O.use_inductor=true",
|
"-O.use_inductor_graph_partition=true",
|
||||||
"-O.backend",
|
"-O.backend",
|
||||||
"custom",
|
"custom",
|
||||||
"-O1",
|
"-O1",
|
||||||
@ -219,7 +219,7 @@ def test_dict_args(parser):
|
|||||||
}
|
}
|
||||||
assert parsed_args.compilation_config == {
|
assert parsed_args.compilation_config == {
|
||||||
"mode": 1,
|
"mode": 1,
|
||||||
"use_inductor": True,
|
"use_inductor_graph_partition": True,
|
||||||
"backend": "custom",
|
"backend": "custom",
|
||||||
"custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"],
|
"custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"],
|
||||||
}
|
}
|
||||||
|
|||||||
@ -264,7 +264,6 @@ class CompilationConfig:
|
|||||||
- [`cudagraph_copy_inputs`]
|
- [`cudagraph_copy_inputs`]
|
||||||
[vllm.config.CompilationConfig.cudagraph_copy_inputs]
|
[vllm.config.CompilationConfig.cudagraph_copy_inputs]
|
||||||
- Inductor compilation:
|
- Inductor compilation:
|
||||||
- [`use_inductor`][vllm.config.CompilationConfig.use_inductor]
|
|
||||||
- [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
|
- [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
|
||||||
- [`inductor_compile_config`]
|
- [`inductor_compile_config`]
|
||||||
[vllm.config.CompilationConfig.inductor_compile_config]
|
[vllm.config.CompilationConfig.inductor_compile_config]
|
||||||
@ -348,7 +347,7 @@ class CompilationConfig:
|
|||||||
- 'none,+op1,+op2' to enable only op1 and op2
|
- 'none,+op1,+op2' to enable only op1 and op2
|
||||||
|
|
||||||
By default, all custom ops are enabled when running without Inductor and
|
By default, all custom ops are enabled when running without Inductor and
|
||||||
disabled when running with Inductor: mode>=VLLM_COMPILE and use_inductor=True.
|
disabled when running with Inductor: mode>=VLLM_COMPILE and backend="inductor".
|
||||||
Inductor generates (fused) Triton kernels for disabled custom ops."""
|
Inductor generates (fused) Triton kernels for disabled custom ops."""
|
||||||
splitting_ops: list[str] | None = None
|
splitting_ops: list[str] | None = None
|
||||||
"""A list of ops to exclude from cudagraphs, used in piecewise compilation.
|
"""A list of ops to exclude from cudagraphs, used in piecewise compilation.
|
||||||
@ -374,24 +373,6 @@ class CompilationConfig:
|
|||||||
Disabled by default until more models are supported/tested to work."""
|
Disabled by default until more models are supported/tested to work."""
|
||||||
|
|
||||||
# Inductor capture
|
# Inductor capture
|
||||||
use_inductor: bool | None = None
|
|
||||||
"""
|
|
||||||
Whether to use inductor compilation.
|
|
||||||
|
|
||||||
This flag is deprecated and will be removed in the next release 0.12.0.
|
|
||||||
Please use the 'backend' option instead.
|
|
||||||
|
|
||||||
- False: inductor compilation is not used. graph runs in eager
|
|
||||||
(custom_ops enabled by default).
|
|
||||||
- True: inductor compilation is used (custom_ops disabled by default).
|
|
||||||
One graph for symbolic shape and one graph per size in compile_sizes
|
|
||||||
are compiled using configurations in inductor_compile_config.
|
|
||||||
|
|
||||||
This setting is ignored if mode<VLLM_COMPILE.
|
|
||||||
|
|
||||||
For future compatibility:
|
|
||||||
If use_inductor is True, backend="inductor" otherwise backend="eager".
|
|
||||||
"""
|
|
||||||
compile_sizes: list[int | str] | None = None
|
compile_sizes: list[int | str] | None = None
|
||||||
"""Sizes to compile for inductor. In addition
|
"""Sizes to compile for inductor. In addition
|
||||||
to integers, it also supports "cudagraph_capture_sizes" to
|
to integers, it also supports "cudagraph_capture_sizes" to
|
||||||
@ -759,14 +740,6 @@ class CompilationConfig:
|
|||||||
f"Invalid backend for piecewise compilation: {self.backend}"
|
f"Invalid backend for piecewise compilation: {self.backend}"
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.use_inductor is not None:
|
|
||||||
logger.warning_once(
|
|
||||||
"The 'use_inductor' flag is deprecated and will be "
|
|
||||||
"removed in the next release (v0.12.0). "
|
|
||||||
"Please use the 'backend' option instead.",
|
|
||||||
)
|
|
||||||
self.backend = "inductor" if self.use_inductor else "eager"
|
|
||||||
|
|
||||||
if self.backend == "":
|
if self.backend == "":
|
||||||
self.backend = current_platform.get_compile_backend()
|
self.backend = current_platform.get_compile_backend()
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user