Signed-off-by: Remy <eunhwan.shin@dtonic.io>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
parent 91130ae376
commit feaf202e93
@@ -27,11 +27,17 @@ from ...utils import check_embeddings_close
         pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
                      marks=[pytest.mark.cpu_model]),
         # [Encoder-only]
-        pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
+        pytest.param(
+            "BAAI/bge-base-en-v1.5",
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
         pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
         pytest.param("intfloat/multilingual-e5-small"),
         # [Cross-Encoder]
-        pytest.param("sentence-transformers/stsb-roberta-base-v2"),
+        pytest.param(
+            "sentence-transformers/stsb-roberta-base-v2",
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
     ],
 )
 def test_models(
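The two rewritten params gain pytest.mark.cpu_model, pulling those models into the CPU test subset. For reference, a minimal sketch of how such a custom marker is typically registered so that `pytest -m cpu_model` can filter on it; this conftest is illustrative, not vLLM's actual test config:

    # conftest.py (illustrative sketch, not vLLM's actual conftest)
    def pytest_configure(config):
        # Register the custom markers so "pytest -m cpu_model" selects
        # only the marked params, without unknown-marker warnings.
        config.addinivalue_line(
            "markers",
            "cpu_model: model is expected to pass on the CPU backend")
        config.addinivalue_line(
            "markers",
            "core_model: model belongs to the always-run core set")

With the marker registered, `pytest -m cpu_model` deselects every pytest.param that lacks the mark, which is what opts BAAI/bge-base-en-v1.5 and stsb-roberta-base-v2 into the CPU runs.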
@@ -3665,7 +3665,8 @@ class VllmConfig:
         # logger should only print warning message for hybrid models. As we
         # can't know whether the model is hybrid or not now, so we don't log
         # warning message here and will log it later.
-        if not (current_platform.is_cuda() or current_platform.is_rocm()):
+        if not (current_platform.is_cuda() or current_platform.is_rocm()
+                or current_platform.is_cpu()):
             # Hybrid KV cache manager is not supported on non-GPU platforms.
             self.scheduler_config.disable_hybrid_kv_cache_manager = True
         if self.kv_transfer_config is not None:
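After this hunk, the CPU platform no longer forces the hybrid KV cache manager off; only platforms outside CUDA, ROCm, and CPU still disable it. A minimal restatement of the gate as a standalone predicate (the function name is hypothetical; current_platform mirrors vLLM's platform object):

    # Hypothetical standalone restatement of the gate in the diff above.
    def hybrid_kv_cache_manager_unsupported(current_platform) -> bool:
        # Before this change, CPU fell through to "unsupported"; now only
        # platforms other than CUDA, ROCm, and CPU disable the manager.
        return not (current_platform.is_cuda()
                    or current_platform.is_rocm()
                    or current_platform.is_cpu())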
@@ -55,11 +55,23 @@ class CPUModelRunner(GPUModelRunner):
             raise ValueError("Multiple KVCacheGroups is not"
                              "currently supported with CPU model runner.")
 
-        assert type(self.attn_groups[0]
-                    [0].metadata_builder) is TorchSDPAMetadataBuilderV1
-
-        self.attn_groups[0][0].metadata_builder.reorder_batch(
-            self.input_batch, scheduler_output)
+        # Guard against encoder-only / pooling models where `attn_groups`
+        # may be empty or lack the expected metadata_builder.
+        # Without this check, accessing `attn_groups[0][0]` would trigger
+        # an AssertionError on CPU backend.
+        if not hasattr(self, "attn_groups") or not self.attn_groups:
+            return
+        if not self.attn_groups[0]:
+            return
+
+        mb = getattr(self.attn_groups[0][0], "metadata_builder", None)
+        if not isinstance(mb, TorchSDPAMetadataBuilderV1):
+            # Encoder-only / rerank models do not benefit from reordering,
+            # so we safely skip here.
+            return
+
+        # Safe path for decoder/attention-heavy models
+        mb.reorder_batch(self.input_batch, scheduler_output)
 
     def _postprocess_tensors(self) -> None:
         # Note: replace device tensors with cpu tensors
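This hunk trades a hard assert for a graceful skip: a missing or empty attn_groups list, or a metadata builder of an unexpected type, now returns instead of raising. A self-contained sketch of the same defensive-access pattern; Runner, Group, and Builder are hypothetical stand-ins, not vLLM classes:

    # Standalone illustration of the guard pattern used in the diff above.
    class Builder:
        def reorder_batch(self, input_batch, scheduler_output):
            print("reordering", input_batch, scheduler_output)

    class Group:
        def __init__(self, metadata_builder=None):
            self.metadata_builder = metadata_builder

    class Runner:
        def __init__(self, attn_groups):
            self.attn_groups = attn_groups

        def maybe_reorder(self, input_batch, scheduler_output):
            # Bail out instead of asserting when the structure is absent.
            if not getattr(self, "attn_groups", None) or not self.attn_groups[0]:
                return
            mb = getattr(self.attn_groups[0][0], "metadata_builder", None)
            if not isinstance(mb, Builder):
                return  # e.g. encoder-only models: nothing to reorder
            mb.reorder_batch(input_batch, scheduler_output)

    # Empty groups (pooling model): silently skips instead of raising.
    Runner([]).maybe_reorder("batch", "sched")
    # Populated groups: dispatches to the builder as before.
    Runner([[Group(Builder())]]).maybe_reorder("batch", "sched")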