mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-15 00:05:48 +08:00
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
parent
ed540d6d4c
commit
8f18feb191
@ -355,13 +355,13 @@ def test_toy_llama(
|
|||||||
)
|
)
|
||||||
|
|
||||||
compile_config_no_compile = CompilationConfig(
|
compile_config_no_compile = CompilationConfig(
|
||||||
level=CompilationMode.NONE,
|
mode=CompilationMode.NONE,
|
||||||
cudagraph_mode=CUDAGraphMode.NONE,
|
cudagraph_mode=CUDAGraphMode.NONE,
|
||||||
backend="eager",
|
backend="eager",
|
||||||
)
|
)
|
||||||
|
|
||||||
compile_config_no_split = CompilationConfig(
|
compile_config_no_split = CompilationConfig(
|
||||||
level=CompilationMode.VLLM_COMPILE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
use_inductor_graph_partition=use_inductor_graph_partition,
|
use_inductor_graph_partition=use_inductor_graph_partition,
|
||||||
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
||||||
backend=backend,
|
backend=backend,
|
||||||
|
|||||||
@ -38,7 +38,7 @@ class CompiledMod(torch.nn.Module):
|
|||||||
def make_vllm_config() -> VllmConfig:
|
def make_vllm_config() -> VllmConfig:
|
||||||
return VllmConfig(
|
return VllmConfig(
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationMode.VLLM_COMPILE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -168,7 +168,7 @@ def test_splitting_ops_dynamic():
|
|||||||
if is_torch_equal_or_newer("2.9.0.dev"):
|
if is_torch_equal_or_newer("2.9.0.dev"):
|
||||||
config = VllmConfig(
|
config = VllmConfig(
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationMode.VLLM_COMPILE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
use_inductor_graph_partition=True,
|
use_inductor_graph_partition=True,
|
||||||
splitting_ops=["vllm::unified_attention"],
|
splitting_ops=["vllm::unified_attention"],
|
||||||
)
|
)
|
||||||
@ -180,7 +180,7 @@ def test_splitting_ops_dynamic():
|
|||||||
# When attn_fusion pass enabled, splitting_ops now default to attention ops.
|
# When attn_fusion pass enabled, splitting_ops now default to attention ops.
|
||||||
config = VllmConfig(
|
config = VllmConfig(
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationMode.VLLM_COMPILE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
pass_config={"enable_attn_fusion": True, "enable_noop": True},
|
pass_config={"enable_attn_fusion": True, "enable_noop": True},
|
||||||
custom_ops=["+quant_fp8"],
|
custom_ops=["+quant_fp8"],
|
||||||
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
||||||
@ -195,7 +195,7 @@ def test_splitting_ops_dynamic():
|
|||||||
if is_torch_equal_or_newer("2.9.0.dev"):
|
if is_torch_equal_or_newer("2.9.0.dev"):
|
||||||
config = VllmConfig(
|
config = VllmConfig(
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
level=CompilationMode.VLLM_COMPILE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
use_inductor_graph_partition=True,
|
use_inductor_graph_partition=True,
|
||||||
pass_config={"enable_attn_fusion": True, "enable_noop": True},
|
pass_config={"enable_attn_fusion": True, "enable_noop": True},
|
||||||
custom_ops=["+quant_fp8"],
|
custom_ops=["+quant_fp8"],
|
||||||
|
|||||||
@ -198,7 +198,7 @@ def run_model(compile_config: int | CompilationConfig, model: str, **model_kwarg
|
|||||||
compilation_config = (
|
compilation_config = (
|
||||||
compile_config
|
compile_config
|
||||||
if isinstance(compile_config, CompilationConfig)
|
if isinstance(compile_config, CompilationConfig)
|
||||||
else CompilationConfig(level=compile_config)
|
else CompilationConfig(mode=compile_config)
|
||||||
)
|
)
|
||||||
|
|
||||||
prompts = [
|
prompts = [
|
||||||
|
|||||||
@ -151,7 +151,7 @@ def test_attn_quant(
|
|||||||
cudagraph_mode=mode,
|
cudagraph_mode=mode,
|
||||||
splitting_ops=splitting_ops,
|
splitting_ops=splitting_ops,
|
||||||
# Common
|
# Common
|
||||||
level=CompilationMode.VLLM_COMPILE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True),
|
pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True),
|
||||||
# Inductor caches custom passes by default as well via uuid
|
# Inductor caches custom passes by default as well via uuid
|
||||||
inductor_compile_config={"force_disable_caches": True},
|
inductor_compile_config={"force_disable_caches": True},
|
||||||
@ -236,7 +236,7 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
|
|||||||
custom_ops=custom_ops_list,
|
custom_ops=custom_ops_list,
|
||||||
splitting_ops=splitting_ops,
|
splitting_ops=splitting_ops,
|
||||||
# Common
|
# Common
|
||||||
level=CompilationMode.VLLM_COMPILE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
pass_config=PassConfig(
|
pass_config=PassConfig(
|
||||||
enable_attn_fusion=True,
|
enable_attn_fusion=True,
|
||||||
enable_noop=True,
|
enable_noop=True,
|
||||||
@ -273,7 +273,7 @@ def run_model(compile_config: int | CompilationConfig, model: str, **model_kwarg
|
|||||||
compilation_config = (
|
compilation_config = (
|
||||||
compile_config
|
compile_config
|
||||||
if isinstance(compile_config, CompilationConfig)
|
if isinstance(compile_config, CompilationConfig)
|
||||||
else CompilationConfig(level=compile_config)
|
else CompilationConfig(mode=compile_config)
|
||||||
)
|
)
|
||||||
|
|
||||||
prompts = [
|
prompts = [
|
||||||
|
|||||||
@ -36,7 +36,7 @@ class Relu3(ReLUSquaredActivation):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"env, torch_level, backend, ops_enabled, default_on",
|
"env, compilation_mode, backend, ops_enabled, default_on",
|
||||||
[
|
[
|
||||||
# Default values based on compile level
|
# Default values based on compile level
|
||||||
# - All by default (no Inductor compilation)
|
# - All by default (no Inductor compilation)
|
||||||
@ -77,7 +77,7 @@ class Relu3(ReLUSquaredActivation):
|
|||||||
)
|
)
|
||||||
def test_enabled_ops(
|
def test_enabled_ops(
|
||||||
env: str | None,
|
env: str | None,
|
||||||
torch_level: int,
|
compilation_mode: int,
|
||||||
backend: str,
|
backend: str,
|
||||||
ops_enabled: list[int],
|
ops_enabled: list[int],
|
||||||
default_on: bool,
|
default_on: bool,
|
||||||
@ -85,7 +85,7 @@ def test_enabled_ops(
|
|||||||
custom_ops = env.split(",") if env else []
|
custom_ops = env.split(",") if env else []
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
backend=backend, level=torch_level, custom_ops=custom_ops
|
backend=backend, mode=compilation_mode, custom_ops=custom_ops
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
with set_current_vllm_config(vllm_config):
|
with set_current_vllm_config(vllm_config):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user