[Ascend]: Fix the issue where the OOT platform vllm-ascend could not enable sequence parallelism (SP) in eager mode (#28935)

Signed-off-by: leo-pony <nengjunma@outlook.com>
2026-07-28 22:47:54 +08:00 · 2025-12-02 04:02:18 +08:00 · 2025-12-02 04:02:18 +08:00 · eaf81485ed
commit eaf81485ed
parent 38caf7fa1a
2 changed files with 15 additions and 2 deletions
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@ -855,6 +855,13 @@ class CompilationConfig:
        self.compute_bs_to_padded_graph_size()
    def set_splitting_ops_for_v1(self):
        # Stay compatible with OOT hardware plugin platforms (for example
        # vllm-ascend), which currently support sequence parallelism only in
        # eager mode.
        if self.mode != CompilationMode.VLLM_COMPILE:
            if self.splitting_ops is None:
                self.splitting_ops = []
            return
        # NOTE: this function needs to be called only when mode is
        # CompilationMode.VLLM_COMPILE
        assert self.mode == CompilationMode.VLLM_COMPILE, (
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@ -797,8 +797,7 @@ class VllmConfig:
        ), "MTP with cp_kv_cache_interleave_size > 1 is not supported now."
        # Do this after all the updates to compilation_config.mode
-        if self.compilation_config.mode == CompilationMode.VLLM_COMPILE:
+        self.compilation_config.set_splitting_ops_for_v1()
            self.compilation_config.set_splitting_ops_for_v1()
        if self.compilation_config.pass_config.enable_sequence_parallelism:
            # With pipeline parallelism or dynamo partitioning,
@ -806,6 +805,13 @@ class VllmConfig:
            # Use custom rms norm to unblock. In the future,
            # the pass will operate on higher-level IR to avoid the issue.
            # TODO: https://github.com/vllm-project/vllm/issues/27894
            if self.compilation_config.mode != CompilationMode.VLLM_COMPILE:
                logger.warning(
                    "Sequence parallelism is enabled, but running in wrong "
                    "vllm compile mode: %s.",
                    self.compilation_config.mode,
                )
            is_fullgraph = (
                self.compilation_config.use_inductor_graph_partition
                or len(self.compilation_config.splitting_ops) == 0