[CI/Build][Bugfix] Fix Qwen VL tests on CPU (#23818)

Signed-off-by: jiang1.li <jiang1.li@intel.com>
Li, Jiang 2025-08-28 19:57:05 +08:00 committed by GitHub
parent d99c3a4f7b
commit 67cee40da0
2 changed files with 14 additions and 14 deletions


@@ -49,23 +49,23 @@ function cpu_tests() {
   # Run kernel tests
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -v -s tests/kernels/test_onednn.py"
+    pytest -x -v -s tests/kernels/test_onednn.py"

   # Run basic model test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     # Note: disable until supports V1
-    # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
     # Note: disable Bart until supports V1
-    pytest -v -s tests/models/language/generation -m cpu_model \
+    pytest -x -v -s tests/models/language/generation -m cpu_model \
       --ignore=tests/models/language/generation/test_bart.py
-    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
+    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
       --ignore=tests/models/language/generation/test_bart.py
-    pytest -v -s tests/models/language/pooling -m cpu_model
-    pytest -v -s tests/models/multimodal/generation \
+    pytest -x -v -s tests/models/language/pooling -m cpu_model
+    pytest -x -v -s tests/models/multimodal/generation \
       --ignore=tests/models/multimodal/generation/test_mllama.py \
       --ignore=tests/models/multimodal/generation/test_pixtral.py \
       -m cpu_model"
@@ -73,20 +73,20 @@ function cpu_tests() {
   # Run compressed-tensor test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -s -v \
+    pytest -x -s -v \
       tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"

   # Note: disable it until supports V1
   # Run AWQ test
   # docker exec cpu-test-"$NUMA_NODE" bash -c "
   #   set -e
-  #   VLLM_USE_V1=0 pytest -s -v \
+  #   VLLM_USE_V1=0 pytest -x -s -v \
   #     tests/quantization/test_ipex_quant.py"

   # Run multi-lora tests
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -s -v \
+    pytest -x -s -v \
       tests/lora/test_qwen2vl.py"

   # online serving
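
Every pytest invocation in this CI script gains -x (--exitfirst), so a suite aborts at the first failing test instead of churning through the remainder; on a slow CPU runner that makes a red build surface much sooner. For reference, a minimal sketch of the same fail-fast invocation driven from Python via pytest.main, reusing the oneDNN kernel test path from the first hunk:

    import sys

    import pytest

    # -x / --exitfirst: stop the session at the first failure or error.
    # -v: verbose per-test output; -s: do not capture stdout/stderr.
    sys.exit(pytest.main(["-x", "-v", "-s", "tests/kernels/test_onednn.py"]))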


@@ -507,9 +507,9 @@ def merge_multimodal_embeddings(
     This updates ``inputs_embeds`` in place.
     """
     if isinstance(placeholder_token_id, list):
-        placeholder_token_id = torch.tensor(placeholder_token_id,
-                                            pin_memory=True).to(
-                                                device=input_ids.device,
-                                                non_blocking=True)
+        placeholder_token_id = torch.tensor(
+            placeholder_token_id,
+            pin_memory=is_pin_memory_available()).to(device=input_ids.device,
+                                                     non_blocking=True)
     return _merge_multimodal_embeddings(
         inputs_embeds,
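
This hunk is the actual bugfix: torch.tensor(..., pin_memory=True) needs a pinned-memory allocator from an accelerator backend, so it fails on CPU-only PyTorch builds, which is presumably what broke the Qwen VL tests in the CPU CI. Gating the flag on vLLM's is_pin_memory_available() keeps pinning (and the overlapped non_blocking copy it enables) on GPU hosts while degrading gracefully on CPU. A minimal sketch of the pattern, using torch.cuda.is_available() as a stand-in for the vLLM helper and a hypothetical ids_to_device function:

    import torch

    def ids_to_device(token_ids: list[int], device: torch.device) -> torch.Tensor:
        # Pinned (page-locked) host memory requires an accelerator allocator;
        # on a CPU-only torch build, pin_memory=True raises a RuntimeError.
        # Stand-in check for vllm.utils.is_pin_memory_available():
        pin = torch.cuda.is_available()
        # non_blocking=True only overlaps the host-to-device copy when the
        # source is pinned; with pin=False it falls back to a blocking copy.
        return torch.tensor(token_ids, pin_memory=pin).to(device=device,
                                                          non_blocking=True)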