[Frontend] Remap -O to -cc commandline flag (#29557)

Signed-off-by: Yanan Cao <gmagogsfm@gmail.com>
Co-authored-by: Claude <noreply@anthropic.com>

parent fecae12cd7
commit 3461e7efd8
@@ -35,7 +35,7 @@ docker run \
 echo $ZE_AFFINITY_MASK
 pip install tblib==3.1.0
 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
+python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
 VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
@@ -8,9 +8,9 @@ TL;DR:
 | Online Flag | Offline Flag | Result |
 |----------|----------|-------------|
 | --enforce-eager | enforce_eager=True | Turn off torch.compile and CUDAGraphs |
-| -O.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only |
-| -O.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only |
-| -O.backend=eager | compilation_config=CompilationConfig(backend='eager') | Turn off TorchInductor |
+| -cc.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only |
+| -cc.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only |
+| -cc.backend=eager | compilation_config=CompilationConfig(backend='eager') | Turn off TorchInductor |

 ## vLLM-torch.compile overview

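For readers mapping the table above to the offline API, here is a minimal Python sketch of the equivalents it lists. The flag-to-field mapping comes from the table itself; the import paths (`vllm.config.CompilationConfig`, `vllm.config.CUDAGraphMode`) are assumptions and may differ across vLLM versions.

```py
# Offline equivalents of the online flags in the table above (sketch only;
# import paths are assumed and may differ across vLLM versions).
from vllm import LLM
from vllm.config import CompilationConfig, CUDAGraphMode

# --enforce-eager: turn off torch.compile and CUDAGraphs.
llm = LLM(model="facebook/opt-125m", enforce_eager=True)

# -cc.cudagraph_mode=NONE: turn off CUDAGraphs only.
llm = LLM(
    model="facebook/opt-125m",
    compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE),
)

# -cc.backend=eager: turn off TorchInductor only.
llm = LLM(
    model="facebook/opt-125m",
    compilation_config=CompilationConfig(backend="eager"),
)
```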
@@ -86,11 +86,11 @@ LLM(model, enforce_eager=True)
 ```

 To turn off just torch.compile, pass `mode = NONE` to the compilation config.
-(`-O` is short for `--compilation_config`):
+(`-cc` is short for `--compilation_config`; `-O.*` dotted syntax is deprecated):

 ```sh
 # Online
-vllm serve -O.mode=0
+vllm serve -cc.mode=0
 ```

 ```py
@@ -103,7 +103,7 @@ To turn off just CUDAGraphs, pass `cudagraph_mode = NONE`:

 ```sh
 # Online
-vllm serve -O.cudagraph_mode=NONE
+vllm serve -cc.cudagraph_mode=NONE
 ```

 ```py
@@ -183,10 +183,10 @@ help debug the issue:

 ```sh
 # Online - using unbacked mode
-vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=unbacked
+vllm serve meta-llama/Llama-3.2-1B -cc.dynamic_shapes_config.type=unbacked

 # Online - using backed_size_oblivious mode
-vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=backed_size_oblivious
+vllm serve meta-llama/Llama-3.2-1B -cc.dynamic_shapes_config.type=backed_size_oblivious
 ```

 ```py
@@ -233,7 +233,7 @@ to the compilation config:

 ```sh
 # online
-vllm serve -O.backend=eager
+vllm serve -cc.backend=eager
 ```

 ```py
@@ -252,7 +252,7 @@ You can also use `TORCH_LOGS=output_code <command>` to print the Inductor output
 ### Editable TorchInductor code

 You can edit the TorchInductor code that gets run by setting `VLLM_COMPILE_CACHE_SAVE_FORMAT=unpacked`
-or passing `-O.compile_cache_save_format=unpacked`. The default is `binary`, which means it is not editable.
+or passing `-cc.compile_cache_save_format=unpacked`. The default is `binary`, which means it is not editable.

 This is a useful technique: you can put breakpoints (e.g. `torch.distributed.breakpoint()`)
 and print statements in the output code.
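As a rough offline-API counterpart to the flags described in that hunk, here is a sketch. The env var name comes from the doc text and the config field name mirrors `-cc.compile_cache_save_format`; the exact offline spelling and import path are assumptions.

```py
# Sketch of the two routes above for getting editable Inductor output
# (field name mirrors -cc.compile_cache_save_format; treat the exact
# offline spelling as an assumption).
import os
from vllm import LLM
from vllm.config import CompilationConfig

# Route 1: environment variable, as named in the doc text.
os.environ["VLLM_COMPILE_CACHE_SAVE_FORMAT"] = "unpacked"

# Route 2: compilation config, equivalent to -cc.compile_cache_save_format=unpacked.
llm = LLM(
    model="facebook/opt-125m",
    compilation_config=CompilationConfig(compile_cache_save_format="unpacked"),
)
```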
@@ -299,7 +299,7 @@ To turn off just CUDAGraphs, pass `cudagraph_mode = NONE`:

 ```sh
 # Online
-vllm serve -O.cudagraph_mode=NONE
+vllm serve -cc.cudagraph_mode=NONE
 ```

 ```py
@@ -117,7 +117,7 @@ vllm serve meta-llama/Llama-3.2-1B \


 # Alternative: Using dot notation (simpler for single values)
-vllm serve meta-llama/Llama-3.2-1B -O.dynamic_shapes_config.type=unbacked
+vllm serve meta-llama/Llama-3.2-1B -cc.dynamic_shapes_config.type=unbacked
 ```

 #### Choosing the Right Mode
@@ -115,7 +115,7 @@ def test_compile_correctness(
         str(pp_size),
         "-tp",
         str(tp_size),
-        "-O.cudagraph_mode=none",
+        "-cc.cudagraph_mode=none",
     ]

     all_args: list[list[str]] = []
@@ -128,7 +128,7 @@ def test_compile_correctness(
     ]:
         for mode in [CompilationMode.NONE, comp_mode]:
             all_args.append(
-                final_args + [f"-O.mode={mode.name}", "-O.backend=inductor"]
+                final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"]
             )

     # inductor will change the output, so we only compare if the output
@@ -148,7 +148,7 @@ def test_compile_correctness(
         CompilationMode.DYNAMO_TRACE_ONCE,
         CompilationMode.VLLM_COMPILE,
     ]:
-        all_args.append(final_args + [f"-O.mode={mode.name}", "-O.backend=eager"])
+        all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"])
         all_envs.append({})
         all_envs.append({})

@@ -248,15 +248,15 @@ def test_optimization_level(args, expected):
 @pytest.mark.parametrize(
     ("args", "expected"),
     [
-        (["-O.mode=0"], 0),
-        (["-O.mode=1"], 1),
-        (["-O.mode=2"], 2),
-        (["-O.mode=3"], 3),
+        (["-cc.mode=0"], 0),
+        (["-cc.mode=1"], 1),
+        (["-cc.mode=2"], 2),
+        (["-cc.mode=3"], 3),
     ],
 )
 def test_mode_parser(args, expected):
     """
-    Test compilation config modes (-O.mode=int) map to compilation_config.
+    Test compilation config modes (-cc.mode=int) map to compilation_config.
     """
     parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
     parsed_args = parser.parse_args(args)
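For context on what the integer modes in the parametrization above correspond to: `NONE=0` is confirmed by the TL;DR table earlier in this diff, and the remaining names appear in the tests; the exact value ordering below is an assumption.

```py
# Hypothetical stand-in for vLLM's CompilationMode enum, showing the assumed
# integer-to-name mapping behind -cc.mode=0..3 (only NONE=0 is confirmed by
# the docs hunk in this commit; the rest of the ordering is an assumption).
from enum import IntEnum

class CompilationMode(IntEnum):
    NONE = 0                 # turn off torch.compile
    STOCK_TORCH_COMPILE = 1
    DYNAMO_TRACE_ONCE = 2
    VLLM_COMPILE = 3         # full vLLM compilation path

assert CompilationMode(0) is CompilationMode.NONE
```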
@@ -273,7 +273,7 @@ def test_compilation_config():
     # set to string form of a dict
     args = parser.parse_args(
         [
-            "-O",
+            "-cc",
             '{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], "backend": "eager"}',
         ]
     )
@@ -27,7 +27,7 @@ def parser():
     parser.add_argument("--batch-size", type=int)
     parser.add_argument("--enable-feature", action="store_true")
     parser.add_argument("--hf-overrides", type=json.loads)
-    parser.add_argument("-O", "--compilation-config", type=json.loads)
+    parser.add_argument("-cc", "--compilation-config", type=json.loads)
     parser.add_argument("--optimization-level", type=int)
     return parser

@@ -167,8 +167,8 @@ def test_dict_args(parser):
         "--hf-overrides.key2.key4",
         "val3",
         # Test compile config and compilation mode
-        "-O.use_inductor_graph_partition=true",
-        "-O.backend",
+        "-cc.use_inductor_graph_partition=true",
+        "-cc.backend",
         "custom",
         "-O1",
         # Test = sign
@@ -191,9 +191,9 @@ def test_dict_args(parser):
         "--hf_overrides.key14.key15",
         "-minus.and.dot",
         # Test array values
-        "-O.custom_ops+",
+        "-cc.custom_ops+",
         "-quant_fp8",
-        "-O.custom_ops+=+silu_mul,-rms_norm",
+        "-cc.custom_ops+=+silu_mul,-rms_norm",
     ]
     parsed_args = parser.parse_args(args)
     assert parsed_args.model_name == "something.something"
@@ -234,7 +234,7 @@ def test_duplicate_dict_args(caplog_vllm, parser):
         "--hf-overrides.key1",
         "val2",
         "-O1",
-        "-O.mode",
+        "-cc.mode",
         "2",
         "-O3",
     ]
@@ -380,29 +380,29 @@ def test_load_config_file(tmp_path):


 def test_compilation_mode_string_values(parser):
-    """Test that -O.mode accepts both integer and string mode values."""
-    args = parser.parse_args(["-O.mode", "0"])
+    """Test that -cc.mode accepts both integer and string mode values."""
+    args = parser.parse_args(["-cc.mode", "0"])
     assert args.compilation_config == {"mode": 0}

     args = parser.parse_args(["-O3"])
     assert args.optimization_level == 3

-    args = parser.parse_args(["-O.mode=NONE"])
+    args = parser.parse_args(["-cc.mode=NONE"])
     assert args.compilation_config == {"mode": "NONE"}

-    args = parser.parse_args(["-O.mode", "STOCK_TORCH_COMPILE"])
+    args = parser.parse_args(["-cc.mode", "STOCK_TORCH_COMPILE"])
     assert args.compilation_config == {"mode": "STOCK_TORCH_COMPILE"}

-    args = parser.parse_args(["-O.mode=DYNAMO_TRACE_ONCE"])
+    args = parser.parse_args(["-cc.mode=DYNAMO_TRACE_ONCE"])
     assert args.compilation_config == {"mode": "DYNAMO_TRACE_ONCE"}

-    args = parser.parse_args(["-O.mode", "VLLM_COMPILE"])
+    args = parser.parse_args(["-cc.mode", "VLLM_COMPILE"])
     assert args.compilation_config == {"mode": "VLLM_COMPILE"}

-    args = parser.parse_args(["-O.mode=none"])
+    args = parser.parse_args(["-cc.mode=none"])
     assert args.compilation_config == {"mode": "none"}

-    args = parser.parse_args(["-O.mode=vllm_compile"])
+    args = parser.parse_args(["-cc.mode=vllm_compile"])
     assert args.compilation_config == {"mode": "vllm_compile"}


@@ -458,3 +458,25 @@ def test_flat_product():
         (3, 4, "a", 5, 6),
         (3, 4, "b", 5, 6),
     ]
+
+
+def test_o_legacy_syntax_deprecation(caplog_vllm):
+    """Test that -O.* dotted syntax emits warnings and converts correctly to -cc syntax."""
+    parser = FlexibleArgumentParser()
+    parser.add_argument("-cc", "--compilation-config", type=json.loads)
+
+    # Test that -O.backend gets converted correctly AND emits warning
+    args = parser.parse_args(["-O.backend=eager"])
+    assert args.compilation_config == {"backend": "eager"}
+
+    # Check that deprecation warning was logged
+    assert len(caplog_vllm.records) >= 1
+    assert (
+        "The -O.* dotted syntax for --compilation-config is deprecated"
+        in caplog_vllm.text
+    )
+
+    # Test that -O.mode gets converted correctly
+    # Note: warning_once won't emit again in same session
+    args = parser.parse_args(["-O.mode=2"])
+    assert args.compilation_config == {"mode": 2}
@@ -194,7 +194,7 @@ class VllmConfig:
     """`torch.compile` and cudagraph capture configuration for the model.

     As a shorthand, one can append compilation arguments via
-    -O.parameter=argument such as `-O.mode=3` (same as `-O='{"mode":3}'`).
+    -cc.parameter=argument such as `-cc.mode=3` (same as `-cc='{"mode":3}'`).

     You can specify the full compilation config like so:
     `{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}`
@@ -1107,7 +1107,7 @@ class EngineArgs:
             "--ec-transfer-config", **vllm_kwargs["ec_transfer_config"]
         )
         vllm_group.add_argument(
-            "--compilation-config", "-O", **vllm_kwargs["compilation_config"]
+            "--compilation-config", "-cc", **vllm_kwargs["compilation_config"]
         )
         vllm_group.add_argument(
             "--additional-config", **vllm_kwargs["additional_config"]
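Worth noting: a single-dash, multi-character alias like `-cc` is ordinary argparse behavior and needs no special parser support; only the dotted `-cc.key=value` expansion relies on vLLM's FlexibleArgumentParser, as the tests above exercise. A standalone stdlib sketch (not vLLM's EngineArgs):

```py
# Standalone argparse sketch: stock argparse accepts a single-dash,
# multi-character alias such as -cc for --compilation-config.
import json
from argparse import ArgumentParser

p = ArgumentParser()
p.add_argument("--compilation-config", "-cc", type=json.loads)

args = p.parse_args(["-cc", '{"mode": 3}'])
assert args.compilation_config == {"mode": 3}
```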
@@ -257,6 +257,17 @@ class FlexibleArgumentParser(ArgumentParser):
            ):
                # Convert -O <n> to --optimization-level <n>
                processed_args.append("--optimization-level")
+            elif arg.startswith("-O."):
+                # Handle -O.* dotted syntax - ALL dotted syntax is deprecated
+                logger.warning_once(
+                    "The -O.* dotted syntax for --compilation-config is "
+                    "deprecated and will be removed in v0.13.0 or v1.0.0"
+                    ", whichever is earlier. Please use -cc.* instead. "
+                    "Example: -cc.backend=eager instead of "
+                    "-O.backend=eager."
+                )
+                converted_arg = arg.replace("-O", "-cc", 1)
+                processed_args.append(converted_arg)
            else:
                processed_args.append(arg)

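The conversion step added above can be distilled into a standalone sketch. This is not vLLM's parser; it only mirrors the `-O.*` to `-cc.*` rewrite shown in the hunk and leaves the numeric `-O<n>` shorthand alone, which the hunk handles separately via `--optimization-level`.

```py
# Standalone distillation of the remapping added above (not vLLM's parser).
def remap_legacy_flags(argv: list[str]) -> list[str]:
    out = []
    for arg in argv:
        if arg.startswith("-O."):
            # Deprecated dotted syntax: rewrite the prefix once,
            # e.g. "-O.backend=eager" -> "-cc.backend=eager".
            out.append(arg.replace("-O", "-cc", 1))
        else:
            out.append(arg)
    return out

assert remap_legacy_flags(["-O.mode=2"]) == ["-cc.mode=2"]
assert remap_legacy_flags(["-O3"]) == ["-O3"]  # -O<n> is handled elsewhere
```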