From 143844fa43d4851831e89200c9a6069c929f8882 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Mon, 29 Sep 2025 13:15:10 +0800 Subject: [PATCH] [XPU]Fix xpu spec decoding UTs, avoid using cuda graph (#25847) Signed-off-by: Kunshang Ji --- .buildkite/scripts/hardware_ci/run-xpu-test.sh | 2 +- tests/utils.py | 2 ++ vllm/v1/spec_decode/eagle.py | 7 ++++--- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 6b9c0121c4aa..2fd7265fa536 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -42,7 +42,7 @@ docker run \ pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/structured_output - pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py + pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py pytest -v -s v1/test_metrics pytest -v -s v1/test_serial_utils.py diff --git a/tests/utils.py b/tests/utils.py index ab6ccc7ad9f9..ffdc0f732543 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1143,6 +1143,8 @@ def get_attn_backend_list_based_on_platform() -> list[str]: print("Skip FLASH_ATTN on ROCm as aiter is not installed") return attn_backend_list + elif current_platform.is_xpu(): + return ["FLASH_ATTN", "TRITON_ATTN"] else: raise ValueError("Unsupported platform") diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 1b5bafb9ca1b..a2f7dbe5703f 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -72,12 +72,13 @@ class EagleProposer: self.attn_metadata_builder: Optional[AttentionMetadataBuilder] = None - self.use_cuda_graph = (self.vllm_config.compilation_config.level + self.use_cuda_graph = (not current_platform.is_xpu() + and self.vllm_config.compilation_config.level == CompilationLevel.PIECEWISE and not self.vllm_config.model_config.enforce_eager) self.cudagraph_batch_sizes = list( - reversed( - self.vllm_config.compilation_config.cudagraph_capture_sizes)) + reversed(self.vllm_config.compilation_config. + cudagraph_capture_sizes)) if self.use_cuda_graph else [] # persistent buffers for cuda graph self.input_ids = torch.zeros(self.max_num_tokens,