From 67cee40da035b7478483c76dfbe0bfc321c3822f Mon Sep 17 00:00:00 2001
From: "Li, Jiang"
Date: Thu, 28 Aug 2025 19:57:05 +0800
Subject: [PATCH] [CI/Build][Bugfix] Fix Qwen VL tests on CPU (#23818)

Signed-off-by: jiang1.li
---
 .../scripts/hardware_ci/run-cpu-test.sh | 20 ++++++++++----------
 vllm/model_executor/models/utils.py     |  8 ++++----
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 9dec9f8e9eb3..8b8f0e8c6578 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -49,23 +49,23 @@ function cpu_tests() {
   # Run kernel tests
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -v -s tests/kernels/test_onednn.py"
+    pytest -x -v -s tests/kernels/test_onednn.py"
 
   # Run basic model test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     # Note: disable until supports V1
-    # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
 
     # Note: disable Bart until supports V1
-    pytest -v -s tests/models/language/generation -m cpu_model \
+    pytest -x -v -s tests/models/language/generation -m cpu_model \
       --ignore=tests/models/language/generation/test_bart.py
-    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
+    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
       --ignore=tests/models/language/generation/test_bart.py
-    pytest -v -s tests/models/language/pooling -m cpu_model
-    pytest -v -s tests/models/multimodal/generation \
+    pytest -x -v -s tests/models/language/pooling -m cpu_model
+    pytest -x -v -s tests/models/multimodal/generation \
       --ignore=tests/models/multimodal/generation/test_mllama.py \
       --ignore=tests/models/multimodal/generation/test_pixtral.py \
       -m cpu_model"
@@ -73,20 +73,20 @@ function cpu_tests() {
   # Run compressed-tensor test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -s -v \
+    pytest -x -s -v \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
 
   # Note: disable it until supports V1
   # Run AWQ test
   # docker exec cpu-test-"$NUMA_NODE" bash -c "
   #   set -e
-  #   VLLM_USE_V1=0 pytest -s -v \
+  #   VLLM_USE_V1=0 pytest -x -s -v \
   #   tests/quantization/test_ipex_quant.py"
 
   # Run multi-lora tests
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -s -v \
+    pytest -x -s -v \
     tests/lora/test_qwen2vl.py"
 
   # online serving
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 11e098f1d7bd..28cfefac30dd 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -507,10 +507,10 @@ def merge_multimodal_embeddings(
     This updates ``inputs_embeds`` in place.
     """
     if isinstance(placeholder_token_id, list):
-        placeholder_token_id = torch.tensor(placeholder_token_id,
-                                            pin_memory=True).to(
-                                                device=input_ids.device,
-                                                non_blocking=True)
+        placeholder_token_id = torch.tensor(
+            placeholder_token_id,
+            pin_memory=is_pin_memory_available()).to(device=input_ids.device,
+                                                     non_blocking=True)
     return _merge_multimodal_embeddings(
         inputs_embeds,
         torch.isin(input_ids, placeholder_token_id),
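
Two things are going on in this patch. The -x flag makes pytest stop at the first failing test, so a broken CPU run fails fast instead of spending CI time on the remaining cases. The substantive bugfix is in merge_multimodal_embeddings: torch.tensor(..., pin_memory=True) unconditionally requests pinned (page-locked) host memory, which raises a RuntimeError on CPU-only builds where no accelerator backs pinning; gating it on is_pin_memory_available() falls back to ordinary pageable memory there. Below is a minimal sketch of that guarded pattern, not part of the patch itself, assuming is_pin_memory_available is importable from vllm.utils as it is in the patched module, with illustrative token ids:

import torch

from vllm.utils import is_pin_memory_available

# Illustrative placeholder ids; real ones come from the model's processor.
placeholder_token_id = [32000, 32001]
input_ids = torch.tensor([1, 32000, 7, 32001, 2])

# is_pin_memory_available() returns False on CPU-only hosts, so the
# tensor is allocated in pageable memory there instead of raising.
placeholder = torch.tensor(
    placeholder_token_id,
    pin_memory=is_pin_memory_available()).to(device=input_ids.device,
                                             non_blocking=True)

# Boolean mask of placeholder positions, as merge_multimodal_embeddings
# computes before scattering the multimodal embeddings in place.
is_multimodal = torch.isin(input_ids, placeholder)

Pinning only pays off when the tensor is subsequently copied to an accelerator with non_blocking=True; on the CPU path the .to() call returns the tensor unchanged, so skipping the pin costs nothing.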