[docs][misc] IOProcessor plugins fixes (#24046)

Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
This commit is contained in:
Christian Pinto 2025-09-01 17:17:41 +01:00 committed by GitHub
parent 39a22dcaac
commit cf91a89dd2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 14 additions and 23 deletions

View File

@@ -64,9 +64,9 @@ The `parse_request` method is used for validating the user prompt and converting
The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference.
The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output.
The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/io_processor_pooling` serving endpoint is [here](../../vllm/entrypoints/openai/serving_pooling_with_io_plugin.py).
The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/io_processor_pooling` serving endpoint is available here <gh-file:vllm/entrypoints/openai/serving_pooling_with_io_plugin.py>.
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please, also refer to our [online](../../examples/online_serving/prithvi_geospatial_mae.py) and [offline](../../examples/offline_inference/prithvi_geospatial_mae_io_processor.py) inference examples.
An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/christian-pinto/prithvi_io_processor_plugin). Please, also refer to our online (<gh-file:examples/online_serving/prithvi_geospatial_mae.py>) and offline (<gh-file:examples/offline_inference/prithvi_geospatial_mae_io_processor.py>) inference examples.
## Using an IO Processor plugin

View File

@@ -33,6 +33,7 @@ def main():
},
"priority": 0,
"model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
"softmax": False,
}
ret = requests.post(server_endpoint, json=request_payload_url)

View File

@@ -8,7 +8,7 @@ import datetime
import os
import tempfile
import urllib.request
from collections.abc import AsyncGenerator, Sequence
from collections.abc import Sequence
from typing import Any, Optional, Union
import albumentations
@@ -359,14 +359,6 @@ class PrithviMultimodalDataProcessor(IOProcessor):
return prompts
async def pre_process_async(
self,
prompt: IOProcessorInput,
request_id: Optional[str] = None,
**kwargs,
) -> Union[PromptType, Sequence[PromptType]]:
return self.pre_process(prompt, request_id, **kwargs)
def post_process(
self,
model_output: Sequence[PoolingRequestOutput],
@@ -421,15 +413,6 @@ class PrithviMultimodalDataProcessor(IOProcessor):
data=out_data,
request_id=request_id)
async def post_process_async(
self,
model_output: AsyncGenerator[tuple[int, PoolingRequestOutput]],
request_id: Optional[str] = None,
**kwargs,
) -> IOProcessorOutput:
collected_output = [item async for i, item in model_output]
return self.post_process(collected_output, request_id, **kwargs)
class PrithviMultimodalDataProcessorIndia(PrithviMultimodalDataProcessor):

View File

@@ -113,6 +113,7 @@ async def test_prithvi_mae_plugin_online(
},
"priority": 0,
"model": model_name,
"softmax": False
}
ret = requests.post(

View File

@@ -1424,9 +1424,10 @@ class IOProcessorRequest(OpenAIBaseModel, Generic[T]):
When using IOProcessor plugins, the actual input is processed
by the plugin itself. Hence, we use a generic type for the request data
"""
softmax: bool = True
def to_pooling_params(self):
return PoolingParams(task="encode")
return PoolingParams(task="encode", softmax=self.softmax)
class IOProcessorResponse(OpenAIBaseModel, Generic[T]):

View File

@@ -49,7 +49,12 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
request_id: Optional[str] = None,
**kwargs,
) -> IOProcessorOutput:
collected_output = [item async for i, item in model_output]
# We cannot guarantee outputs are returned in the same order they were
# fed to vLLM.
# Let's sort them by id before post_processing
sorted_output = sorted([(i, item) async for i, item in model_output],
key=lambda output: output[0])
collected_output = [output[1] for output in sorted_output]
return self.post_process(collected_output, request_id, **kwargs)
@abstractmethod
@@ -59,4 +64,4 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]):
@abstractmethod
def output_to_response(
self, plugin_output: IOProcessorOutput) -> IOProcessorResponse:
raise NotImplementedError
raise NotImplementedError