Add support for Prithvi in Online serving mode (#21518)

Signed-off-by: Michele Gazzetti <michele.gazzetti1@ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
mgazz 2025-07-25 15:01:27 +01:00 committed by GitHub
parent 136d750f5f
commit e189b50f53
5 changed files with 128 additions and 10 deletions

View File

@@ -0,0 +1,93 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import io

import numpy as np
import pytest
import requests
import torch

from ...utils import RemoteOpenAIServer

MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"
DTYPE = "float16"


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass


@pytest.fixture(scope="module")
def server():
    args = [
        "--task",
        "embed",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        DTYPE,
        "--enforce-eager",
        "--trust-remote-code",
        "--skip-tokenizer-init",
        "--max-num-seqs",
        "32",
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_request(server: RemoteOpenAIServer, model_name: str):
    # Build dummy model inputs: a 6-band 512x512 image and lat/lon coords.
    pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
    location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)

    # Serialize each tensor with torch.save and base64-encode the bytes
    # so they can travel in the JSON request body.
    buffer_tiff = io.BytesIO()
    torch.save(pixel_values, buffer_tiff)
    buffer_tiff.seek(0)
    binary_data = buffer_tiff.read()
    base64_tensor_embedding = base64.b64encode(binary_data).decode("utf-8")

    buffer_coord = io.BytesIO()
    torch.save(location_coords, buffer_coord)
    buffer_coord.seek(0)
    binary_data = buffer_coord.read()
    base64_coord_embedding = base64.b64encode(binary_data).decode("utf-8")

    prompt = {
        "model": model_name,
        "additional_data": {
            "prompt_token_ids": [1]
        },
        "encoding_format": "base64",
        "messages": [{
            "role": "user",
            "content": [{
                "type": "image_embeds",
                "image_embeds": {
                    "pixel_values": base64_tensor_embedding,
                    "location_coords": base64_coord_embedding,
                },
            }],
        }],
    }

    # test single pooling
    response = requests.post(server.url_for("pooling"), json=prompt)
    response.raise_for_status()

    output = response.json()["data"][0]["data"]

    np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)
    assert len(np_response) == 524288
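
For reference, the same request can be issued against any running server, not just the pytest fixture. A minimal client sketch; the localhost URL and port are assumptions, everything else mirrors the test above:

import base64
import io

import requests
import torch


def encode_tensor(t: torch.Tensor) -> str:
    # Serialize with torch.save into memory, then base64-encode the bytes.
    buf = io.BytesIO()
    torch.save(t, buf)
    return base64.b64encode(buf.getvalue()).decode("utf-8")


payload = {
    "model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
    "additional_data": {"prompt_token_ids": [1]},
    "encoding_format": "base64",
    "messages": [{
        "role": "user",
        "content": [{
            "type": "image_embeds",
            "image_embeds": {
                "pixel_values": encode_tensor(
                    torch.full((6, 512, 512), 1.0, dtype=torch.float16)),
                "location_coords": encode_tensor(
                    torch.full((1, 2), 1.0, dtype=torch.float16)),
            },
        }],
    }],
}

# Assumes the server was started with --task embed --trust-remote-code
# --skip-tokenizer-init, as in the fixture above.
resp = requests.post("http://localhost:8000/pooling", json=payload)
resp.raise_for_status()
embedding = base64.b64decode(resp.json()["data"][0]["data"])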

View File

@@ -97,11 +97,16 @@ class MQLLMEngineClient(EngineClient):
         self.model_config = engine_config.model_config
         self.decoding_config = engine_config.decoding_config
 
-        # Create the tokenizer group.
-        self.tokenizer = init_tokenizer_from_configs(
-            model_config=self.model_config,
-            scheduler_config=engine_config.scheduler_config,
-            lora_config=engine_config.lora_config)
+        if self.vllm_config.model_config.skip_tokenizer_init:
+            self.tokenizer = None
+        else:
+            # Create the tokenizer group.
+            self.tokenizer = init_tokenizer_from_configs(
+                model_config=self.model_config,
+                scheduler_config=engine_config.scheduler_config,
+                lora_config=engine_config.lora_config)
 
         self.input_preprocessor = InputPreprocessor(self.model_config,
                                                     self.tokenizer)
@@ -375,7 +380,10 @@ class MQLLMEngineClient(EngineClient):
         return self.input_preprocessor
 
     async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None):
-        return await self.tokenizer.get_lora_tokenizer_async(lora_request)
+        if self.tokenizer is None:
+            return None
+        else:
+            return await self.tokenizer.get_lora_tokenizer_async(lora_request)
 
     async def get_vllm_config(self) -> VllmConfig:
        return self.vllm_config
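
Since get_tokenizer() can now return None when the server runs with --skip-tokenizer-init, downstream callers have to guard for a missing tokenizer. A minimal caller-side sketch; the describe_tokenizer helper is hypothetical, not part of this commit:

# Hypothetical helper illustrating the new None contract of get_tokenizer().
async def describe_tokenizer(client: "MQLLMEngineClient") -> str:
    tokenizer = await client.get_tokenizer()
    if tokenizer is None:
        # Tokenizer-free models such as Prithvi consume raw tensors,
        # so there is nothing to tokenize or decode.
        return "tokenizer disabled (skip_tokenizer_init=True)"
    return f"tokenizer ready, vocab size {tokenizer.vocab_size}"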

View File

@@ -880,7 +880,10 @@ class OpenAIServing:
         _chat_template_kwargs.update(chat_template_kwargs or {})
 
         request_prompt: Union[str, list[int]]
-        if isinstance(tokenizer, MistralTokenizer):
+        if tokenizer is None:
+            request_prompt = "placeholder"
+        elif isinstance(tokenizer, MistralTokenizer):
             request_prompt = apply_mistral_chat_template(
                 tokenizer,
                 messages=messages,
@@ -910,7 +913,14 @@ class OpenAIServing:
                 request = tool_parser(tokenizer).adjust_request(  # type: ignore
                     request=request)
 
-        if isinstance(request_prompt, str):
+        if tokenizer is None:
+            assert isinstance(request_prompt, str), (
+                "Prompt has to be a string "
+                "when the tokenizer is not initialised")
+            prompt_inputs = TextTokensPrompt(prompt=request_prompt,
+                                             prompt_token_ids=[1])
+        elif isinstance(request_prompt, str):
             prompt_inputs = await self._tokenize_prompt_input_async(
                 request,
                 tokenizer,
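
In the tokenizer-free branch the chat template is never applied: the request payload travels entirely as multimodal data, and the engine only needs a syntactically valid prompt. A simplified sketch of the prompt object this branch produces; the TypedDict here is a stand-in for vLLM's actual TextTokensPrompt:

from typing import TypedDict


class TextTokensPrompt(TypedDict):
    # Simplified stand-in for the real type used above.
    prompt: str
    prompt_token_ids: list[int]


# With --skip-tokenizer-init, the branch above always yields a dummy
# prompt: "placeholder" text plus a single dummy token id.
prompt_inputs = TextTokensPrompt(prompt="placeholder", prompt_token_ids=[1])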

View File

@@ -96,7 +96,11 @@ class OpenAIServingPooling(OpenAIServing):
                 self.max_model_len, truncate_prompt_tokens)
 
         lora_request = self._maybe_get_adapters(request)
-        tokenizer = await self.engine_client.get_tokenizer(lora_request)
+        if self.model_config.skip_tokenizer_init:
+            tokenizer = None
+        else:
+            tokenizer = await self.engine_client.get_tokenizer(lora_request)
 
         if isinstance(request, PoolingChatRequest):
             (

View File

@@ -103,7 +103,10 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
         mm_kwargs = {}
 
         for k, v in mm_data.items():
-            mm_kwargs[k] = v
+            if isinstance(v, dict) and k == "image":
+                mm_kwargs.update(v)
+            else:
+                mm_kwargs[k] = v
         mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]}
 
         # This model receives as input a multi-dimensional tensor representing
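
The processor change flattens the nested "image" dict into top-level kwargs, so the tensors serialized by the client reach the model as individual keyword arguments. A minimal sketch of the transformation, using the field names from this commit (tensor shapes match the test above):

import torch

# mm_data as it arrives from the OpenAI-compatible frontend (sketch):
mm_data = {
    "image": {
        "pixel_values": torch.zeros(6, 512, 512),
        "location_coords": torch.zeros(1, 2),
    }
}

mm_kwargs = {}
for k, v in mm_data.items():
    if isinstance(v, dict) and k == "image":
        mm_kwargs.update(v)  # flatten the nested image dict
    else:
        mm_kwargs[k] = v

# mm_kwargs now has top-level "pixel_values" and "location_coords" keys.
assert set(mm_kwargs) == {"pixel_values", "location_coords"}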