mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-17 16:03:42 +08:00
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> (cherry picked from commit 30bb19a760d6d5e8c69b3a4c78c9cb7430872a61)
This commit is contained in:
parent
55f1fc1b1b
commit
2c0ee0fde8
@ -233,24 +233,6 @@ def test_splitting_ops_dynamic():
|
|||||||
assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
|
assert config.compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE
|
||||||
|
|
||||||
|
|
||||||
def test_moe_splitting_ops_deepep_ht_piecewise():
|
|
||||||
# Non-inductor, non-attn-fusion case: DeepEP HT with dp>1
|
|
||||||
# should add MoE ops to splitting_ops on top of attention ops.
|
|
||||||
config = VllmConfig(
|
|
||||||
parallel_config=ParallelConfig(
|
|
||||||
all2all_backend="deepep_high_throughput",
|
|
||||||
data_parallel_size=8,
|
|
||||||
),
|
|
||||||
compilation_config=CompilationConfig(
|
|
||||||
mode=CompilationMode.VLLM_COMPILE,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
splitting_ops = config.compilation_config.splitting_ops
|
|
||||||
assert splitting_ops is not None
|
|
||||||
assert "vllm::moe_forward" in splitting_ops
|
|
||||||
assert "vllm::moe_forward_shared" in splitting_ops
|
|
||||||
|
|
||||||
|
|
||||||
def test_moe_splitting_ops_deepep_ht_inductor_partition():
|
def test_moe_splitting_ops_deepep_ht_inductor_partition():
|
||||||
# Inductor partition case: user-provided splitting_ops should be
|
# Inductor partition case: user-provided splitting_ops should be
|
||||||
# preserved and MoE ops should be appended for DeepEP HT with dp>1.
|
# preserved and MoE ops should be appended for DeepEP HT with dp>1.
|
||||||
@ -277,26 +259,6 @@ def test_moe_splitting_ops_deepep_ht_inductor_partition():
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_moe_splitting_ops_deepep_ht_attn_fusion_no_inductor():
|
|
||||||
# Pure attn-fusion case without inductor partition: even with
|
|
||||||
# DeepEP HT and dp>1, we should not re-enable piecewise compilation
|
|
||||||
# or add MoE ops into splitting_ops.
|
|
||||||
config = VllmConfig(
|
|
||||||
parallel_config=ParallelConfig(
|
|
||||||
all2all_backend="deepep_high_throughput",
|
|
||||||
data_parallel_size=8,
|
|
||||||
),
|
|
||||||
compilation_config=CompilationConfig(
|
|
||||||
mode=CompilationMode.VLLM_COMPILE,
|
|
||||||
pass_config={"fuse_attn_quant": True, "eliminate_noops": True},
|
|
||||||
custom_ops=["+quant_fp8"],
|
|
||||||
cudagraph_mode=CUDAGraphMode.PIECEWISE,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
assert config.compilation_config.splitting_ops == []
|
|
||||||
assert config.compilation_config.cudagraph_mode == CUDAGraphMode.FULL
|
|
||||||
|
|
||||||
|
|
||||||
def test_should_split():
|
def test_should_split():
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
|||||||
@ -915,8 +915,6 @@ class CompilationConfig:
|
|||||||
"mode is CompilationMode.VLLM_COMPILE"
|
"mode is CompilationMode.VLLM_COMPILE"
|
||||||
)
|
)
|
||||||
|
|
||||||
added_default_splitting_ops = False
|
|
||||||
|
|
||||||
if self.pass_config.fuse_attn_quant and not self.use_inductor_graph_partition:
|
if self.pass_config.fuse_attn_quant and not self.use_inductor_graph_partition:
|
||||||
self.set_splitting_ops_for_attn_fusion()
|
self.set_splitting_ops_for_attn_fusion()
|
||||||
else:
|
else:
|
||||||
@ -930,7 +928,6 @@ class CompilationConfig:
|
|||||||
# for details. Make a copy to avoid mutating the class-level
|
# for details. Make a copy to avoid mutating the class-level
|
||||||
# list via reference.
|
# list via reference.
|
||||||
self.splitting_ops = list(self._attention_ops)
|
self.splitting_ops = list(self._attention_ops)
|
||||||
added_default_splitting_ops = True
|
|
||||||
elif len(self.splitting_ops) == 0:
|
elif len(self.splitting_ops) == 0:
|
||||||
if (
|
if (
|
||||||
self.cudagraph_mode == CUDAGraphMode.PIECEWISE
|
self.cudagraph_mode == CUDAGraphMode.PIECEWISE
|
||||||
@ -958,44 +955,25 @@ class CompilationConfig:
|
|||||||
self.cudagraph_mode = CUDAGraphMode.FULL
|
self.cudagraph_mode = CUDAGraphMode.FULL
|
||||||
self.splitting_ops = []
|
self.splitting_ops = []
|
||||||
|
|
||||||
# split MoE ops for cudagraph
|
# Disable CUDA graphs for DeepEP high-throughput since it's not CG compatible
|
||||||
moe_ops = [
|
|
||||||
"vllm::moe_forward",
|
|
||||||
"vllm::moe_forward_shared",
|
|
||||||
]
|
|
||||||
backend = all2all_backend or envs.VLLM_ALL2ALL_BACKEND
|
backend = all2all_backend or envs.VLLM_ALL2ALL_BACKEND
|
||||||
dp_size = data_parallel_size if data_parallel_size is not None else 1
|
dp_size = data_parallel_size if data_parallel_size is not None else 1
|
||||||
need_moe_splitting = (
|
if (
|
||||||
backend == "deepep_high_throughput"
|
backend == "deepep_high_throughput"
|
||||||
and dp_size > 1
|
and dp_size > 1
|
||||||
# pure attn-fusion without inductor partition deliberately disables
|
and self.cudagraph_mode != CUDAGraphMode.NONE
|
||||||
# piecewise graphs and MoE splitting.
|
):
|
||||||
and not (
|
# TODO: Piecewise CUDA graphs might be enabled
|
||||||
self.pass_config.fuse_attn_quant
|
# if the torch compile cache key issue is fixed
|
||||||
and not self.use_inductor_graph_partition
|
# See https://github.com/vllm-project/vllm/pull/25093
|
||||||
|
logger.info(
|
||||||
|
"DeepEP: Disabling CUDA Graphs since DeepEP high-throughput kernels "
|
||||||
|
"are optimized for prefill and are incompatible with CUDA Graphs. "
|
||||||
|
"In order to use CUDA Graphs for decode-optimized workloads, "
|
||||||
|
"use --all2all-backend with another option, such as "
|
||||||
|
"deepep_low_latency, pplx, or allgather_reducescatter."
|
||||||
)
|
)
|
||||||
)
|
self.cudagraph_mode = CUDAGraphMode.NONE
|
||||||
|
|
||||||
if need_moe_splitting and self.cudagraph_mode != CUDAGraphMode.NONE:
|
|
||||||
# if we just initialized default splitting_ops for this config,
|
|
||||||
# automatically append the MoE ops
|
|
||||||
if added_default_splitting_ops:
|
|
||||||
for op in moe_ops:
|
|
||||||
if op not in self.splitting_ops:
|
|
||||||
self.splitting_ops.append(op)
|
|
||||||
|
|
||||||
# make sure MoE ops are split out
|
|
||||||
if not any(op in self.splitting_ops for op in moe_ops):
|
|
||||||
self.cudagraph_mode = CUDAGraphMode.NONE
|
|
||||||
logger.warning_once(
|
|
||||||
"DeepEP high throughput backend with data_parallel_size > 1 "
|
|
||||||
"requires splitting MoE ops from cudagraphs. Please ensure "
|
|
||||||
"'vllm::moe_forward' or 'vllm::moe_forward_shared' are "
|
|
||||||
"present in CompilationConfig.splitting_ops."
|
|
||||||
)
|
|
||||||
elif self.cudagraph_mode.has_full_cudagraphs():
|
|
||||||
# fall back to piecewise when MoE splitting is required.
|
|
||||||
self.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
|
||||||
|
|
||||||
def set_splitting_ops_for_attn_fusion(self):
|
def set_splitting_ops_for_attn_fusion(self):
|
||||||
assert self.pass_config.fuse_attn_quant
|
assert self.pass_config.fuse_attn_quant
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user