[Bug] Fix torch Compilation Cache Hit Error (#25093)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
commit d2a30a2d93 (parent 75fb112d80)
@@ -563,18 +563,6 @@ class CompilationConfig:
             self.cudagraph_mode = CUDAGraphMode.FULL
         self.splitting_ops = []
 
-        if envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput":
-            # exclude MoE dispatch/combine from capture by ensuring
-            # piecewise splitting includes them, so communication remains
-            # outside CUDA graphs while compute can still be graphed.
-            moe_ops = [
-                "vllm.moe_forward",
-                "vllm.moe_forward_shared",
-            ]
-            for op in moe_ops:
-                if op not in self.splitting_ops:
-                    self.splitting_ops.append(op)
-
     def splitting_ops_contain_attention(self) -> bool:
         return self.splitting_ops is not None and all(
             op in self.splitting_ops for op in self._attention_ops)
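For orientation between the two hunks: the deleted branch above only made the graph split points depend on the VLLM_ALL2ALL_BACKEND environment variable, appending the two MoE custom ops whenever the DeepEP high-throughput backend was selected. A standalone paraphrase of the before/after behavior, sketched in plain Python rather than vLLM's CompilationConfig (the helper names are made up for illustration):

import os

# Hypothetical helpers paraphrasing the hunk above; they are not part of
# vLLM's API.
def splitting_ops_before_fix() -> list[str]:
    ops: list[str] = []
    if os.environ.get("VLLM_ALL2ALL_BACKEND") == "deepep_high_throughput":
        # Ensure the MoE dispatch/combine ops are split points,
        # without adding duplicates.
        for op in ("vllm.moe_forward", "vllm.moe_forward_shared"):
            if op not in ops:
                ops.append(op)
    return ops

def splitting_ops_after_fix() -> list[str]:
    # After this commit the list stays empty here, whatever the backend.
    return []

With the branch gone, the DeepEP high-throughput case is handled only by the platform-level guard in the next hunk.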
@@ -191,14 +191,17 @@ class CudaPlatformBase(Platform):
         compilation_config = vllm_config.compilation_config
         if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput"
                 and parallel_config.data_parallel_size > 1
-                and compilation_config.cudagraph_mode
-                not in [CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE]):
-            logger.info(
-                "Data Parallel with DeepEP high-throughput: using PIECEWISE "
-                "CUDA graphs and excluding MoE ops from capture. Set "
-                "VLLM_ALL2ALL_BACKEND=deepep_low_latency if you need MoE "
-                "graphs captured as well.")
-            compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
+                and compilation_config.cudagraph_mode != CUDAGraphMode.NONE):
+            # TODO: Piecewise Cuda graph might be enabled
+            # if torch compile cache key issue fixed
+            # See https://github.com/vllm-project/vllm/pull/25093
+            logger.info(
+                "Data Parallel: disabling cudagraphs since DP "
+                "with DeepEP high-throughput kernels are not CUDA Graph "
+                "compatible. The DeepEP low-latency kernels are CUDA Graph "
+                "compatible. Set the all_to_all backend to deepep_low_latency "
+                "to use those kernels instead.")
+            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
 
     @classmethod
     def get_current_memory_usage(cls,
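The net behavior change in this hunk: with data parallelism and the DeepEP high-throughput all-to-all backend, CUDA graphs are now disabled outright rather than forced to PIECEWISE. A minimal sketch of the new decision, assuming a stand-in enum and function signature rather than vLLM's actual CUDAGraphMode and platform config hook:

import enum
import os

class CUDAGraphMode(enum.Enum):
    # Stand-in for vLLM's enum of the same name; member values are
    # illustrative only.
    NONE = 0
    PIECEWISE = 1
    FULL = 2

def resolved_cudagraph_mode(requested: CUDAGraphMode,
                            data_parallel_size: int) -> CUDAGraphMode:
    # Paraphrase of the new guard: any non-NONE request is downgraded to
    # NONE when DP > 1 and the DeepEP high-throughput backend is active.
    # Before this commit, the same situation downgraded the mode to
    # PIECEWISE instead.
    if (os.environ.get("VLLM_ALL2ALL_BACKEND") == "deepep_high_throughput"
            and data_parallel_size > 1
            and requested != CUDAGraphMode.NONE):
        return CUDAGraphMode.NONE
    return requested

As the new log message notes, setting VLLM_ALL2ALL_BACKEND=deepep_low_latency switches to the DeepEP low-latency kernels, which it describes as CUDA Graph compatible.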