# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import copy
from contextlib import nullcontext
from unittest.mock import patch

import pytest
from pydantic import ValidationError

from vllm.compilation.counter import compilation_counter
from vllm.compilation.fix_functionalization import FixFunctionalizationPass
from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
from vllm.config.compilation import CompilationMode
from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import current_platform
from vllm.utils.torch_utils import _is_torch_equal_or_newer

# This import automatically registers `torch.ops.silly.attention`
from . import silly_attention  # noqa: F401


def test_version():
    # Test the version comparison logic using the private function
    assert _is_torch_equal_or_newer("2.8.0.dev20250624+cu128", "2.8.0.dev")
    assert _is_torch_equal_or_newer("2.8.0a0+gitc82a174", "2.8.0.dev")
    assert _is_torch_equal_or_newer("2.8.0", "2.8.0.dev")
    assert _is_torch_equal_or_newer("2.8.1", "2.8.0.dev")
    assert not _is_torch_equal_or_newer("2.7.1", "2.8.0.dev")


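# FixFunctionalizationPass keeps a reference to the compilation config, so a
# deep copy of the pass should carry over the same config values it was
# built with.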
def test_copy_pass():
    vllm_config = VllmConfig()
    inductor_pass = FixFunctionalizationPass(vllm_config)
    copied_inductor_pass = copy.deepcopy(inductor_pass)
    assert (
        copied_inductor_pass.compilation_config.use_inductor_graph_partition
        == vllm_config.compilation_config.use_inductor_graph_partition
    )
    assert (
        copied_inductor_pass.compilation_config.splitting_ops
        == vllm_config.compilation_config.splitting_ops
    )


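# Entries in custom_ops must carry a "+" (enable) or "-" (disable) prefix;
# a bare op name is rejected at validation time.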
def test_custom_op():
    # proper syntax
    _ = CompilationConfig(custom_ops=["+quant_fp8", "-silu_and_mul"])

    with pytest.raises(ValueError, match="Invalid syntax '"):
        _ = CompilationConfig(custom_ops=["quant_fp8"])


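# With VLLM_DISABLE_COMPILE_CACHE=1, compilation still runs but nothing
# should be written to the compile cache, so both cache counters must
# stay at zero.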
# forked is needed to work around https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
# NB: We don't test VLLM_DISABLE_COMPILE_CACHE=0 because that depends
# on the state of the cache directory on the current machine, which
# may be influenced by other tests.
@pytest.mark.parametrize("val", ["1"])
def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", val)

    compilation_config = {
        "cudagraph_mode": CUDAGraphMode.NONE,  # speed things up a bit
    }
    with (
        compilation_counter.expect(
            num_cache_entries_updated=0, num_compiled_artifacts_saved=0
        ),
        # loading the model causes compilation (if enabled) to happen
        vllm_runner(
            "facebook/opt-125m",
            compilation_config=compilation_config,
            gpu_memory_utilization=0.4,
        ) as _,
    ):
        pass


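# Expected capture counts assume a single capture size (100) for
# facebook/opt-125m: PIECEWISE captures one graph per piecewise subgraph
# (13), FULL_DECODE_ONLY captures a single full graph, and
# FULL_AND_PIECEWISE adds one full decode capture on top of the piecewise
# ones (14).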
# forked is needed to work around https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
@pytest.mark.parametrize(
    "cudagraph_mode,num_cudagraph_captured",
    [
        (CUDAGraphMode.NONE, 0),
        (CUDAGraphMode.FULL_DECODE_ONLY, 1),
        (CUDAGraphMode.PIECEWISE, 13),
        (CUDAGraphMode.FULL_AND_PIECEWISE, 14),
    ],
)
def test_use_cudagraphs(
    vllm_runner, monkeypatch, cudagraph_mode, num_cudagraph_captured
):
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    compilation_config = {
        "cudagraph_capture_sizes": [100],
        "cudagraph_mode": cudagraph_mode,
    }
    num_gpu_runner_capture_triggers = 1 if cudagraph_mode != CUDAGraphMode.NONE else 0
    with (
        compilation_counter.expect(
            num_graphs_seen=1,
            num_gpu_runner_capture_triggers=num_gpu_runner_capture_triggers,
            num_cudagraph_captured=num_cudagraph_captured,
        ),
        # loading the model causes compilation (if enabled) to happen
        vllm_runner(
            "facebook/opt-125m",
            compilation_config=compilation_config,
            gpu_memory_utilization=0.4,
        ) as _,
    ):
        pass


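# stock_torch_compile_count tracks models compiled with plain torch.compile
# rather than vLLM's own backend; STOCK_TORCH_COMPILE mode should trigger
# exactly one such compile.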
# forked is needed to work around https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
def test_stock_torch_compile(vllm_runner, monkeypatch):
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    with (
        compilation_counter.expect(stock_torch_compile_count=1),
        # loading the model causes compilation (if enabled) to happen
        vllm_runner(
            "facebook/opt-125m",
            compilation_config={"mode": CompilationMode.STOCK_TORCH_COMPILE},
            gpu_memory_utilization=0.4,
        ) as _,
    ):
        pass


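# CompilationMode.NONE should skip compilation entirely: no Dynamo graphs
# are traced and torch.compile is never invoked.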
# forked is needed to work around https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
def test_no_compilation(vllm_runner, monkeypatch):
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    with (
        compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
        # loading the model causes compilation (if enabled) to happen
        vllm_runner(
            "facebook/opt-125m",
            compilation_config={"mode": CompilationMode.NONE},
            gpu_memory_utilization=0.4,
        ) as _,
    ):
        pass


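# enforce_eager=True is the user-facing switch for disabling compilation,
# so it should behave just like CompilationMode.NONE above.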
# forked is needed to work around https://github.com/vllm-project/vllm/issues/21073
@pytest.mark.forked
def test_enforce_eager(vllm_runner, monkeypatch):
    # Disable multiprocessing so that the counter is in the same process
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    with (
        compilation_counter.expect(num_graphs_seen=0, stock_torch_compile_count=0),
        # loading the model causes compilation (if enabled) to happen
        vllm_runner(
            "facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.4
        ) as _,
    ):
        pass


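# splitting_ops is resolved at config time together with cudagraph_mode,
# use_inductor_graph_partition, and the enabled passes; each block below
# exercises one combination.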
def test_splitting_ops_dynamic():
    # Default config
    config = VllmConfig()
    # Default V1 config leaves cudagraph mode unset; splitting ops are only
    # populated when the engine decides to use piecewise compilation.
    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
    assert not config.compilation_config.splitting_ops_contain_attention()

    # When use_inductor_graph_partition=True
    config = VllmConfig(
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
            use_inductor_graph_partition=True,
            splitting_ops=["vllm::unified_attention"],
        )
    )
    # with inductor partition we use splitting_ops directly for
    # partition rules
    assert config.compilation_config.splitting_ops == ["vllm::unified_attention"]

    # When attn_fusion pass enabled.
    config = VllmConfig(
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
            pass_config={"enable_attn_fusion": True, "enable_noop": True},
            custom_ops=["+quant_fp8"],
            cudagraph_mode=CUDAGraphMode.PIECEWISE,
        )
    )
    assert config.compilation_config.splitting_ops == []
    # cudagraph mode also falls back to FULL
    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL

    # splitting_ops cannot contain attention ops when the attn_fusion
    # pass is enabled.
    with pytest.raises(ValidationError):
        config = VllmConfig(
            compilation_config=CompilationConfig(
                mode=CompilationMode.VLLM_COMPILE,
                pass_config={"enable_attn_fusion": True, "enable_noop": True},
                custom_ops=["+quant_fp8"],
                cudagraph_mode=CUDAGraphMode.PIECEWISE,
                # workaround for accessing all attention ops
                splitting_ops=CompilationConfig()._attention_ops,
            )
        )

    # When both use_inductor_graph_partition and attn_fusion pass enabled.
    config = VllmConfig(
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
            use_inductor_graph_partition=True,
            pass_config={"enable_attn_fusion": True, "enable_noop": True},
            custom_ops=["+quant_fp8"],
            cudagraph_mode=CUDAGraphMode.PIECEWISE,
        )
    )
    # With inductor graph partition, attn_fusion and splitting_ops
    # work together. Default splitting_ops include attention ops.
    assert config.compilation_config.splitting_ops_contain_attention()
    # enable_attn_fusion is directly supported under
    # use_inductor_graph_partition=True, and cudagraph_mode
    # is unchanged.
    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE


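# should_split matches a node against splitting_ops at two granularities:
# "ns::op" matches every overload of the OpOverloadPacket, while
# "ns::op.overload" matches only that specific OpOverload.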
def test_should_split():
    import torch

    from vllm.compilation.partition_rules import should_split

    graph = torch.fx.Graph()
    node = torch.fx.Node(
        graph=graph,
        name="dummy_node",
        op="call_function",
        target=torch.ops.aten.add.default,
        args=(),
        kwargs={},
    )

    # supports OpOverloadPacket
    splitting_ops = ["aten::add"]
    assert should_split(node, splitting_ops)

    # supports OpOverload
    splitting_ops = ["aten::add.default"]
    assert should_split(node, splitting_ops)

    # a different overload of the same packet must not match
    splitting_ops = ["aten::add.Tensor"]
    assert not should_split(node, splitting_ops)

    q, k, v, out = [torch.randn(1)] * 4

    # supports custom ops as OpOverloadPacket
    node = torch.fx.Node(
        graph=graph,
        name="dummy_node",
        op="call_function",
        target=torch.ops.silly.attention,
        args=(q, k, v, out),
        kwargs={},
    )

    splitting_ops = ["silly::attention"]
    assert should_split(node, splitting_ops)

    # supports custom ops as OpOverload
    node = torch.fx.Node(
        graph=graph,
        name="dummy_node",
        op="call_function",
        target=torch.ops.silly.attention.default,
        args=(q, k, v, out),
        kwargs={},
    )

    splitting_ops = ["silly::attention"]
    assert should_split(node, splitting_ops)

    splitting_ops = ["silly::attention.default"]
    assert should_split(node, splitting_ops)


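# Exercises the post-init resolution of max_cudagraph_capture_size: it is
# taken from the capture-size list or a default, truncated to an aligned
# multiple, filtered for sequence parallelism and max_num_batched_tokens,
# and invalid combinations raise a pydantic ValidationError.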
@pytest.mark.skipif(
    not current_platform.support_static_graph_mode(),
    reason="Skip if cudagraph mode is not supported",
)
@pytest.mark.parametrize(
    (
        "cudagraph_capture_sizes",
        "max_cudagraph_capture_size",
        "tp_size",
        "enable_sequence_parallelism",
        "max_num_batched_tokens",
        "cudagraph_mode",
        "expected_max_size",
    ),
    [
        (None, None, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 256),
        ([1, 2, 4], 4, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 4),
        (
            [1, 2, 4],
            8,
            1,
            False,
            2048,
            CUDAGraphMode.FULL_AND_PIECEWISE,
            ValidationError,
        ),
        ([1, 256], None, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 256),
        ([], None, 1, False, 2048, CUDAGraphMode.NONE, 0),
        (None, 0, 1, False, 2048, CUDAGraphMode.NONE, 0),
        # truncated to the nearest multiple of 8 or 16
        (None, 257, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 256),
        # max from list
        ([1, 2, 4, 15], None, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 15),
        # 15 is filtered out due to sequence parallelism
        ([1, 2, 4, 15], None, 2, True, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, 4),
        # limited by max_num_batched_tokens
        ([1, 2, 4, 15], None, 1, False, 8, CUDAGraphMode.FULL_AND_PIECEWISE, 4),
        # the list must contain at least 1 element when using cudagraphs
        ([], None, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, ValidationError),
        # the max capture size must be >= 1 when using cudagraphs
        (None, 0, 1, False, 2048, CUDAGraphMode.FULL_AND_PIECEWISE, ValidationError),
    ],
)
def test_cudagraph_sizes_post_init(
    cudagraph_capture_sizes,
    max_cudagraph_capture_size,
    tp_size,
    enable_sequence_parallelism,
    max_num_batched_tokens,
    cudagraph_mode,
    expected_max_size,
):
    ctx = nullcontext()
    if expected_max_size == ValidationError:
        ctx = pytest.raises(expected_max_size)

    with (
        ctx,
        patch("vllm.config.parallel.cuda_device_count_stateless", return_value=tp_size),
    ):
        compilation_config = CompilationConfig(
            cudagraph_capture_sizes=cudagraph_capture_sizes,
            max_cudagraph_capture_size=max_cudagraph_capture_size,
            pass_config={
                "enable_sequence_parallelism": enable_sequence_parallelism,
                "enable_fusion": True,
                "enable_noop": True,
            },
            cudagraph_mode=cudagraph_mode,
        )
        engine_args = EngineArgs(
            model="facebook/opt-125m",
            tensor_parallel_size=tp_size,
            max_num_seqs=min(max_num_batched_tokens, 128),
            max_num_batched_tokens=max_num_batched_tokens,
            compilation_config=compilation_config,
        )
        vllm_config = engine_args.create_engine_config()

        assert (
            vllm_config.compilation_config.max_cudagraph_capture_size
            == expected_max_size
        )