vllm-project/vllm
Signed-off-by: Remy <eunhwan.shin@dtonic.io>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
commit feaf202e93 (parent: 91130ae376)
@@ -27,11 +27,17 @@ from ...utils import check_embeddings_close
         pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
                      marks=[pytest.mark.cpu_model]),
         # [Encoder-only]
-        pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
+        pytest.param(
+            "BAAI/bge-base-en-v1.5",
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
         pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
         pytest.param("intfloat/multilingual-e5-small"),
         # [Cross-Encoder]
-        pytest.param("sentence-transformers/stsb-roberta-base-v2"),
+        pytest.param(
+            "sentence-transformers/stsb-roberta-base-v2",
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
     ],
 )
 def test_models(
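For context, a minimal self-contained sketch (not vLLM's actual test file) of how these marks control which parametrized cases each suite collects; `core_model` and `cpu_model` are custom markers that vLLM registers in its pytest configuration, and the test body below is a placeholder:

import pytest

@pytest.mark.parametrize(
    "model",
    [
        # Collected by both `-m core_model` and `-m cpu_model` runs.
        pytest.param("BAAI/bge-base-en-v1.5",
                     marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
        # Unmarked: collected only by an unfiltered invocation.
        pytest.param("intfloat/multilingual-e5-small"),
    ],
)
def test_models(model: str) -> None:
    assert model  # placeholder for the real embedding comparison

With this in place, `pytest -m cpu_model` collects only the first case, which is why adding `pytest.mark.cpu_model` above pulls the encoder-only and cross-encoder models into the CPU suite.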
@@ -3665,7 +3665,8 @@ class VllmConfig:
         # logger should only print warning message for hybrid models. As we
         # can't know whether the model is hybrid or not now, so we don't log
         # warning message here and will log it later.
-        if not (current_platform.is_cuda() or current_platform.is_rocm()):
+        if not (current_platform.is_cuda() or current_platform.is_rocm()
+                or current_platform.is_cpu()):
             # Hybrid KV cache manager is not supported on non-GPU platforms.
             self.scheduler_config.disable_hybrid_kv_cache_manager = True
         if self.kv_transfer_config is not None:
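The net effect of the one-line change is that CPU now counts as a platform where the hybrid KV cache manager stays enabled. A hedged sketch of the gate, using plain booleans in place of vLLM's `current_platform` checks (`SchedulerConfig` and `gate_hybrid_kv_cache` here are stand-ins, not the real classes):

from dataclasses import dataclass

@dataclass
class SchedulerConfig:
    disable_hybrid_kv_cache_manager: bool = False

def gate_hybrid_kv_cache(cfg: SchedulerConfig, is_cuda: bool,
                         is_rocm: bool, is_cpu: bool) -> None:
    # Disable only outside the supported set; after this commit the set
    # is {CUDA, ROCm, CPU} rather than {CUDA, ROCm}.
    if not (is_cuda or is_rocm or is_cpu):
        cfg.disable_hybrid_kv_cache_manager = True

cfg = SchedulerConfig()
gate_hybrid_kv_cache(cfg, is_cuda=False, is_rocm=False, is_cpu=True)
assert not cfg.disable_hybrid_kv_cache_manager  # CPU keeps it enabled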
@@ -55,11 +55,23 @@ class CPUModelRunner(GPUModelRunner):
             raise ValueError("Multiple KVCacheGroups is not"
                              "currently supported with CPU model runner.")
 
-        assert type(self.attn_groups[0]
-                    [0].metadata_builder) is TorchSDPAMetadataBuilderV1
+        # Guard against encoder-only / pooling models where `attn_groups`
+        # may be empty or lack the expected metadata_builder.
+        # Without this check, accessing `attn_groups[0][0]` would trigger
+        # an AssertionError on CPU backend.
+        if not hasattr(self, "attn_groups") or not self.attn_groups:
+            return
+        if not self.attn_groups[0]:
+            return
 
-        self.attn_groups[0][0].metadata_builder.reorder_batch(
-            self.input_batch, scheduler_output)
+        mb = getattr(self.attn_groups[0][0], "metadata_builder", None)
+        if not isinstance(mb, TorchSDPAMetadataBuilderV1):
+            # Encoder-only / rerank models do not benefit from reordering,
+            # so we safely skip here.
+            return
+
+        # Safe path for decoder/attention-heavy models
+        mb.reorder_batch(self.input_batch, scheduler_output)
 
     def _postprocess_tensors(self) -> None:
         # Note: replace device tensors with cpu tensors
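The hard assert is replaced by early returns, so encoder-only and rerank models no longer crash `CPUModelRunner`. A standalone sketch of the same guard pattern; everything below except the `TorchSDPAMetadataBuilderV1` name is a hypothetical stand-in for illustration:

from typing import Any, Optional

class TorchSDPAMetadataBuilderV1:
    def reorder_batch(self, input_batch: Any, scheduler_output: Any) -> None:
        print("batch reordered")

class AttnGroup:
    def __init__(self, metadata_builder: Optional[Any] = None) -> None:
        self.metadata_builder = metadata_builder

def maybe_reorder(runner: Any, scheduler_output: Any) -> None:
    # Pooling / encoder-only models can reach this point with no attention
    # groups at all, so bail out instead of asserting.
    groups = getattr(runner, "attn_groups", None)
    if not groups or not groups[0]:
        return
    # Skip quietly when the builder is missing or of an unexpected type.
    mb = getattr(groups[0][0], "metadata_builder", None)
    if not isinstance(mb, TorchSDPAMetadataBuilderV1):
        return
    mb.reorder_batch(runner.input_batch, scheduler_output)

class _Runner:
    def __init__(self, groups: list) -> None:
        self.attn_groups = groups
        self.input_batch = object()

maybe_reorder(_Runner([]), scheduler_output=None)  # no-op instead of a crash
maybe_reorder(_Runner([[AttnGroup(TorchSDPAMetadataBuilderV1())]]),
              scheduler_output=None)               # prints "batch reordered"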