[CI/Build][Bugfix] Fix Qwen VL tests on CPU (#23818)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
Li, Jiang 2025-08-28 19:57:05 +08:00 committed by GitHub
parent d99c3a4f7b
commit 67cee40da0
2 changed files with 14 additions and 14 deletions


@@ -49,23 +49,23 @@ function cpu_tests() {
   # Run kernel tests
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -v -s tests/kernels/test_onednn.py"
+    pytest -x -v -s tests/kernels/test_onednn.py"

   # Run basic model test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     # Note: disable until supports V1
-    # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
     # Note: disable Bart until supports V1
-    pytest -v -s tests/models/language/generation -m cpu_model \
+    pytest -x -v -s tests/models/language/generation -m cpu_model \
       --ignore=tests/models/language/generation/test_bart.py
-    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
+    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
       --ignore=tests/models/language/generation/test_bart.py
-    pytest -v -s tests/models/language/pooling -m cpu_model
-    pytest -v -s tests/models/multimodal/generation \
+    pytest -x -v -s tests/models/language/pooling -m cpu_model
+    pytest -x -v -s tests/models/multimodal/generation \
       --ignore=tests/models/multimodal/generation/test_mllama.py \
       --ignore=tests/models/multimodal/generation/test_pixtral.py \
       -m cpu_model"
@@ -73,20 +73,20 @@ function cpu_tests() {
   # Run compressed-tensor test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -s -v \
+    pytest -x -s -v \
       tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"

   # Note: disable it until supports V1
   # Run AWQ test
   # docker exec cpu-test-"$NUMA_NODE" bash -c "
   #   set -e
-  #   VLLM_USE_V1=0 pytest -s -v \
+  #   VLLM_USE_V1=0 pytest -x -s -v \
   #     tests/quantization/test_ipex_quant.py"

   # Run multi-lora tests
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -s -v \
+    pytest -x -s -v \
       tests/lora/test_qwen2vl.py"

   # online serving
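
Every pytest invocation in this CI script gains -x (--exitfirst), so a suite aborts at the first failing test instead of churning through the remainder; on a slow CPU runner that makes a red build surface much sooner. For reference, a minimal sketch of the same fail-fast invocation driven from Python via pytest.main, reusing the oneDNN kernel test path from the first hunk:

    import sys

    import pytest

    # -x / --exitfirst: stop the session at the first failure or error.
    # -v: verbose per-test output; -s: do not capture stdout/stderr.
    sys.exit(pytest.main(["-x", "-v", "-s", "tests/kernels/test_onednn.py"]))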


@@ -507,9 +507,9 @@ def merge_multimodal_embeddings(
     This updates ``inputs_embeds`` in place.
     """
     if isinstance(placeholder_token_id, list):
-        placeholder_token_id = torch.tensor(placeholder_token_id,
-                                            pin_memory=True).to(
-                                                device=input_ids.device,
-                                                non_blocking=True)
+        placeholder_token_id = torch.tensor(
+            placeholder_token_id,
+            pin_memory=is_pin_memory_available()).to(device=input_ids.device,
+                                                     non_blocking=True)
     return _merge_multimodal_embeddings(
         inputs_embeds,
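
This hunk is the actual bugfix: torch.tensor(..., pin_memory=True) needs a pinned-memory allocator from an accelerator backend, so it fails on CPU-only PyTorch builds, which is presumably what broke the Qwen VL tests in the CPU CI. Gating the flag on vLLM's is_pin_memory_available() keeps pinning (and the overlapped non_blocking copy it enables) on GPU hosts while degrading gracefully on CPU. A minimal sketch of the pattern, using torch.cuda.is_available() as a stand-in for the vLLM helper and a hypothetical ids_to_device function:

    import torch

    def ids_to_device(token_ids: list[int], device: torch.device) -> torch.Tensor:
        # Pinned (page-locked) host memory requires an accelerator allocator;
        # on a CPU-only torch build, pin_memory=True raises a RuntimeError.
        # Stand-in check for vllm.utils.is_pin_memory_available():
        pin = torch.cuda.is_available()
        # non_blocking=True only overlaps the host-to-device copy when the
        # source is pinned; with pin=False it falls back to a blocking copy.
        return torch.tensor(token_ids, pin_memory=pin).to(device=device,
                                                          non_blocking=True)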