[Bugfix] Guard _may_reorder_batch for encoder-only models on CPU (#24319) (#24348)

Signed-off-by: Remy <eunhwan.shin@dtonic.io>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
Author: Remy, 2025-09-10 15:24:42 +09:00 (committed by GitHub)
commit feaf202e93 (parent 91130ae376)
3 changed files with 26 additions and 7 deletions


@@ -27,11 +27,17 @@ from ...utils import check_embeddings_close
         pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
                      marks=[pytest.mark.cpu_model]),
         # [Encoder-only]
-        pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
+        pytest.param(
+            "BAAI/bge-base-en-v1.5",
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
         pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
         pytest.param("intfloat/multilingual-e5-small"),
         # [Cross-Encoder]
-        pytest.param("sentence-transformers/stsb-roberta-base-v2"),
+        pytest.param(
+            "sentence-transformers/stsb-roberta-base-v2",
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
     ],
 )
 def test_models(
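
The test-side change above only adds marks: with pytest, marks attached via pytest.param apply to that single parametrized case, so CI can select the CPU-capable encoder-only and cross-encoder models with `-m cpu_model`. A minimal, self-contained sketch of that selection behavior (hypothetical file name; the real embedding checks live in the vLLM test suite):

    # sketch_cpu_marks.py -- illustration only, not part of this commit
    import pytest

    @pytest.mark.parametrize(
        "model",
        [
            pytest.param(
                "BAAI/bge-base-en-v1.5",
                marks=[pytest.mark.core_model, pytest.mark.cpu_model],
            ),
            # Unmarked: deselected by `pytest -m cpu_model`.
            pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
        ],
    )
    def test_models(model: str) -> None:
        assert model  # stand-in for the real embedding checks

Running `pytest sketch_cpu_marks.py -m cpu_model` collects only the bge case; registering `cpu_model` and `core_model` in pytest.ini avoids unknown-mark warnings.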


@@ -3665,7 +3665,8 @@ class VllmConfig:
         # logger should only print warning message for hybrid models. As we
         # can't know whether the model is hybrid or not now, so we don't log
         # warning message here and will log it later.
-        if not (current_platform.is_cuda() or current_platform.is_rocm()):
+        if not (current_platform.is_cuda() or current_platform.is_rocm()
+                or current_platform.is_cpu()):
             # Hybrid KV cache manager is not supported on non-GPU platforms.
             self.scheduler_config.disable_hybrid_kv_cache_manager = True
         if self.kv_transfer_config is not None:
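
This config change widens the exemption list: the hybrid KV cache manager used to be force-disabled everywhere except CUDA and ROCm, and the added `current_platform.is_cpu()` now keeps it enabled on CPU as well. A one-function sketch of the resulting logic (hypothetical helper, not vLLM's API):

    def force_disable_hybrid_kv(platform: str) -> bool:
        # Mirrors `not (is_cuda() or is_rocm() or is_cpu())`.
        return platform not in ("cuda", "rocm", "cpu")

    assert force_disable_hybrid_kv("cpu") is False  # newly exempt
    assert force_disable_hybrid_kv("tpu") is True   # still force-disabled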


@@ -55,11 +55,23 @@ class CPUModelRunner(GPUModelRunner):
             raise ValueError("Multiple KVCacheGroups is not"
                              "currently supported with CPU model runner.")
-        assert type(self.attn_groups[0]
-                    [0].metadata_builder) is TorchSDPAMetadataBuilderV1
+        # Guard against encoder-only / pooling models where `attn_groups`
+        # may be empty or lack the expected metadata_builder.
+        # Without this check, accessing `attn_groups[0][0]` would trigger
+        # an AssertionError on CPU backend.
+        if not hasattr(self, "attn_groups") or not self.attn_groups:
+            return
+        if not self.attn_groups[0]:
+            return
-        self.attn_groups[0][0].metadata_builder.reorder_batch(
-            self.input_batch, scheduler_output)
+        mb = getattr(self.attn_groups[0][0], "metadata_builder", None)
+        if not isinstance(mb, TorchSDPAMetadataBuilderV1):
+            # Encoder-only / rerank models do not benefit from reordering,
+            # so we safely skip here.
+            return
+        # Safe path for decoder/attention-heavy models
+        mb.reorder_batch(self.input_batch, scheduler_output)

     def _postprocess_tensors(self) -> None:
         # Note: replace device tensors with cpu tensors
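
The runner fix replaces a hard assert, which failed on CPU for encoder-only and pooling models that never build the expected attention groups, with layered early returns. A self-contained sketch of that guard pattern using stand-in classes (simplified; not vLLM's real types):

    from typing import Any

    class TorchSDPAMetadataBuilderV1:
        def reorder_batch(self, input_batch: Any, scheduler_output: Any) -> None:
            print("reordering decoder batch")

    class AttnGroup:
        def __init__(self, metadata_builder: Any) -> None:
            self.metadata_builder = metadata_builder

    class Runner:
        def __init__(self, attn_groups: list) -> None:
            self.attn_groups = attn_groups
            self.input_batch = object()

        def _may_reorder_batch(self, scheduler_output: Any) -> None:
            # Probe each level instead of asserting a shape that
            # encoder-only / pooling models never produce.
            if not getattr(self, "attn_groups", None):
                return
            if not self.attn_groups[0]:
                return
            mb = getattr(self.attn_groups[0][0], "metadata_builder", None)
            if not isinstance(mb, TorchSDPAMetadataBuilderV1):
                return
            mb.reorder_batch(self.input_batch, scheduler_output)

    Runner([])._may_reorder_batch(None)              # encoder-only: no-op
    group = [AttnGroup(TorchSDPAMetadataBuilderV1())]
    Runner([group])._may_reorder_batch(None)         # decoder path unchanged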