Signed-off-by: Remy <eunhwan.shin@dtonic.io>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
parent 91130ae376
commit feaf202e93
@@ -27,11 +27,17 @@ from ...utils import check_embeddings_close
         pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
                      marks=[pytest.mark.cpu_model]),
         # [Encoder-only]
-        pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
+        pytest.param(
+            "BAAI/bge-base-en-v1.5",
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
         pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
         pytest.param("intfloat/multilingual-e5-small"),
         # [Cross-Encoder]
-        pytest.param("sentence-transformers/stsb-roberta-base-v2"),
+        pytest.param(
+            "sentence-transformers/stsb-roberta-base-v2",
+            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        ),
     ],
 )
 def test_models(
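The two rewritten params gain pytest.mark.cpu_model, pulling those models into the CPU test subset. For reference, a minimal sketch of how such a custom marker is typically registered so that `pytest -m cpu_model` can filter on it; this conftest is illustrative, not vLLM's actual test config:

    # conftest.py (illustrative sketch, not vLLM's actual conftest)
    def pytest_configure(config):
        # Register the custom markers so "pytest -m cpu_model" selects
        # only the marked params, without unknown-marker warnings.
        config.addinivalue_line(
            "markers",
            "cpu_model: model is expected to pass on the CPU backend")
        config.addinivalue_line(
            "markers",
            "core_model: model belongs to the always-run core set")

With the marker registered, `pytest -m cpu_model` deselects every pytest.param that lacks the mark, which is what opts BAAI/bge-base-en-v1.5 and stsb-roberta-base-v2 into the CPU runs.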
@@ -3665,7 +3665,8 @@ class VllmConfig:
         # logger should only print warning message for hybrid models. As we
         # can't know whether the model is hybrid or not now, so we don't log
         # warning message here and will log it later.
-        if not (current_platform.is_cuda() or current_platform.is_rocm()):
+        if not (current_platform.is_cuda() or current_platform.is_rocm()
+                or current_platform.is_cpu()):
             # Hybrid KV cache manager is not supported on non-GPU platforms.
             self.scheduler_config.disable_hybrid_kv_cache_manager = True
         if self.kv_transfer_config is not None:
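After this hunk, the CPU platform no longer forces the hybrid KV cache manager off; only platforms outside CUDA, ROCm, and CPU still disable it. A minimal restatement of the gate as a standalone predicate (the function name is hypothetical; current_platform mirrors vLLM's platform object):

    # Hypothetical standalone restatement of the gate in the diff above.
    def hybrid_kv_cache_manager_unsupported(current_platform) -> bool:
        # Before this change, CPU fell through to "unsupported"; now only
        # platforms other than CUDA, ROCm, and CPU disable the manager.
        return not (current_platform.is_cuda()
                    or current_platform.is_rocm()
                    or current_platform.is_cpu())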
@@ -55,11 +55,23 @@ class CPUModelRunner(GPUModelRunner):
             raise ValueError("Multiple KVCacheGroups is not"
                              "currently supported with CPU model runner.")
 
-        assert type(self.attn_groups[0]
-                    [0].metadata_builder) is TorchSDPAMetadataBuilderV1
-
-        self.attn_groups[0][0].metadata_builder.reorder_batch(
-            self.input_batch, scheduler_output)
+        # Guard against encoder-only / pooling models where `attn_groups`
+        # may be empty or lack the expected metadata_builder.
+        # Without this check, accessing `attn_groups[0][0]` would trigger
+        # an AssertionError on CPU backend.
+        if not hasattr(self, "attn_groups") or not self.attn_groups:
+            return
+        if not self.attn_groups[0]:
+            return
+
+        mb = getattr(self.attn_groups[0][0], "metadata_builder", None)
+        if not isinstance(mb, TorchSDPAMetadataBuilderV1):
+            # Encoder-only / rerank models do not benefit from reordering,
+            # so we safely skip here.
+            return
+
+        # Safe path for decoder/attention-heavy models
+        mb.reorder_batch(self.input_batch, scheduler_output)
 
     def _postprocess_tensors(self) -> None:
         # Note: replace device tensors with cpu tensors
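This hunk trades a hard assert for a graceful skip: a missing or empty attn_groups list, or a metadata builder of an unexpected type, now returns instead of raising. A self-contained sketch of the same defensive-access pattern; Runner, Group, and Builder are hypothetical stand-ins, not vLLM classes:

    # Standalone illustration of the guard pattern used in the diff above.
    class Builder:
        def reorder_batch(self, input_batch, scheduler_output):
            print("reordering", input_batch, scheduler_output)

    class Group:
        def __init__(self, metadata_builder=None):
            self.metadata_builder = metadata_builder

    class Runner:
        def __init__(self, attn_groups):
            self.attn_groups = attn_groups

        def maybe_reorder(self, input_batch, scheduler_output):
            # Bail out instead of asserting when the structure is absent.
            if not getattr(self, "attn_groups", None) or not self.attn_groups[0]:
                return
            mb = getattr(self.attn_groups[0][0], "metadata_builder", None)
            if not isinstance(mb, Builder):
                return  # e.g. encoder-only models: nothing to reorder
            mb.reorder_batch(input_batch, scheduler_output)

    # Empty groups (pooling model): silently skips instead of raising.
    Runner([]).maybe_reorder("batch", "sched")
    # Populated groups: dispatches to the builder as before.
    Runner([[Group(Builder())]]).maybe_reorder("batch", "sched")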