[CI/Build][Bugfix] Fix Qwen VL tests on CPU (#23818)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
This commit is contained in:
parent d99c3a4f7b
commit 67cee40da0
@@ -49,23 +49,23 @@ function cpu_tests() {
   # Run kernel tests
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -v -s tests/kernels/test_onednn.py"
+    pytest -x -v -s tests/kernels/test_onednn.py"
 
   # Run basic model test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     # Note: disable until supports V1
-    # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
 
     # Note: disable Bart until supports V1
-    pytest -v -s tests/models/language/generation -m cpu_model \
+    pytest -x -v -s tests/models/language/generation -m cpu_model \
       --ignore=tests/models/language/generation/test_bart.py
-    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
+    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
       --ignore=tests/models/language/generation/test_bart.py
 
-    pytest -v -s tests/models/language/pooling -m cpu_model
-    pytest -v -s tests/models/multimodal/generation \
+    pytest -x -v -s tests/models/language/pooling -m cpu_model
+    pytest -x -v -s tests/models/multimodal/generation \
       --ignore=tests/models/multimodal/generation/test_mllama.py \
       --ignore=tests/models/multimodal/generation/test_pixtral.py \
       -m cpu_model"
@@ -73,20 +73,20 @@ function cpu_tests() {
   # Run compressed-tensor test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -s -v \
+    pytest -x -s -v \
       tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
 
   # Note: disable it until supports V1
   # Run AWQ test
   # docker exec cpu-test-"$NUMA_NODE" bash -c "
   #   set -e
-  #   VLLM_USE_V1=0 pytest -s -v \
+  #   VLLM_USE_V1=0 pytest -x -s -v \
   #   tests/quantization/test_ipex_quant.py"
 
   # Run multi-lora tests
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -s -v \
+    pytest -x -s -v \
       tests/lora/test_qwen2vl.py"
 
   # online serving
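The CI-script half of the change is uniform: every pytest invocation gains -x (pytest's --exitfirst flag), so each suite aborts at the first failing test instead of spending CPU-runner time on the rest of the run. A minimal sketch of the same fail-fast invocation driven from Python, reusing the onednn test path from the hunk above (any test path works the same way):

import subprocess
import sys

# Fail-fast pytest run mirroring the CI change: -x (--exitfirst) stops
# the session at the first failure; -v/-s match the flags used above.
# The test path comes from the diff; substitute any suite of interest.
result = subprocess.run(
    [sys.executable, "-m", "pytest", "-x", "-v", "-s",
     "tests/kernels/test_onednn.py"],
    check=False,  # inspect the return code instead of raising
)
sys.exit(result.returncode)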
@@ -507,9 +507,9 @@ def merge_multimodal_embeddings(
     This updates ``inputs_embeds`` in place.
     """
     if isinstance(placeholder_token_id, list):
-        placeholder_token_id = torch.tensor(placeholder_token_id,
-                                            pin_memory=True).to(
-                                                device=input_ids.device,
-                                                non_blocking=True)
+        placeholder_token_id = torch.tensor(
+            placeholder_token_id,
+            pin_memory=is_pin_memory_available()).to(device=input_ids.device,
+                                                     non_blocking=True)
     return _merge_multimodal_embeddings(
         inputs_embeds,
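The functional fix is this last hunk: torch.tensor(..., pin_memory=True) unconditionally requests page-locked host memory, which fails on CPU-only builds where no accelerator exists to consume it, and that is what broke the Qwen VL tests on CPU. Gating the request on vLLM's is_pin_memory_available() keeps the pinned, non_blocking host-to-device copy on GPU hosts and degrades gracefully elsewhere. A standalone sketch of the same guard, with a plain torch.cuda.is_available() check standing in for vLLM's helper (helper and function names below are illustrative, not vLLM API):

import torch


def _pin_memory_available() -> bool:
    # Stand-in for vLLM's is_pin_memory_available(): pinned
    # (page-locked) host memory is only useful, and only guaranteed
    # to work, when a CUDA device is present to consume it.
    return torch.cuda.is_available()


def ids_to_device(ids: list[int], device: torch.device) -> torch.Tensor:
    # Mirrors the patched code path: request pinned memory only when
    # the runtime supports it. non_blocking=True overlaps the copy
    # with compute on GPU targets and is a harmless no-op on CPU.
    t = torch.tensor(ids, pin_memory=_pin_memory_available())
    return t.to(device=device, non_blocking=True)


# On a CPU-only host this succeeds, where pin_memory=True would raise.
print(ids_to_device([151655, 151656], torch.device("cpu")))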