Add support for Prithvi in Online serving mode (#21518)

Signed-off-by: Michele Gazzetti <michele.gazzetti1@ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>

commit e189b50f53 (parent 136d750f5f)
tests/entrypoints/openai/test_skip_tokenizer.py (new file, 93 lines)
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import base64
+import io
+
+import numpy as np
+import pytest
+import requests
+import torch
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"
+DTYPE = "float16"
+
+
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Simple autouse wrapper to run both engines for each test.
+    # This can be promoted up to conftest.py to run for every
+    # test in a package.
+    pass
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--task",
+        "embed",
+        # Use half precision for speed and memory savings in the CI environment.
+        "--dtype",
+        DTYPE,
+        "--enforce-eager",
+        "--trust-remote-code",
+        "--skip-tokenizer-init",
+        "--max-num-seqs",
+        "32",
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_request(server: RemoteOpenAIServer, model_name: str):
+    # Serialize the input tensors so they can travel inside a JSON payload.
+    pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
+    location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)
+
+    buffer_tiff = io.BytesIO()
+    torch.save(pixel_values, buffer_tiff)
+    buffer_tiff.seek(0)
+    binary_data = buffer_tiff.read()
+    base64_tensor_embedding = base64.b64encode(binary_data).decode("utf-8")
+
+    buffer_coord = io.BytesIO()
+    torch.save(location_coords, buffer_coord)
+    buffer_coord.seek(0)
+    binary_data = buffer_coord.read()
+    base64_coord_embedding = base64.b64encode(binary_data).decode("utf-8")
+
+    prompt = {
+        "model": model_name,
+        # The tokenizer is skipped, so token ids are supplied directly.
+        "additional_data": {
+            "prompt_token_ids": [1]
+        },
+        "encoding_format": "base64",
+        "messages": [{
+            "role": "user",
+            "content": [{
+                "type": "image_embeds",
+                "image_embeds": {
+                    "pixel_values": base64_tensor_embedding,
+                    "location_coords": base64_coord_embedding,
+                },
+            }],
+        }],
+    }
+
+    # Test a single pooling request.
+    response = requests.post(server.url_for("pooling"), json=prompt)
+    response.raise_for_status()
+
+    output = response.json()["data"][0]["data"]
+
+    np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)
+
+    assert len(np_response) == 524288
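For readers reusing the flow above: the /pooling response carries the embedding as a base64 string of raw float32 values (524288 of them for this model). A minimal decoding sketch, not part of the commit (the helper name decode_pooling_output is ours):

    import base64

    import numpy as np
    import torch


    def decode_pooling_output(b64_data: str) -> torch.Tensor:
        """Decode a base64-encoded pooling response into a float32 tensor."""
        raw = base64.b64decode(b64_data)
        # np.frombuffer returns a read-only view, so copy before wrapping.
        return torch.from_numpy(np.frombuffer(raw, dtype=np.float32).copy())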
@@ -97,11 +97,16 @@ class MQLLMEngineClient(EngineClient):
         self.model_config = engine_config.model_config
         self.decoding_config = engine_config.decoding_config
 
-        # Create the tokenizer group.
-        self.tokenizer = init_tokenizer_from_configs(
-            model_config=self.model_config,
-            scheduler_config=engine_config.scheduler_config,
-            lora_config=engine_config.lora_config)
+        if self.vllm_config.model_config.skip_tokenizer_init:
+            self.tokenizer = None
+        else:
+            # Create the tokenizer group.
+            self.tokenizer = init_tokenizer_from_configs(
+                model_config=self.model_config,
+                scheduler_config=engine_config.scheduler_config,
+                lora_config=engine_config.lora_config)
 
         self.input_preprocessor = InputPreprocessor(self.model_config,
                                                     self.tokenizer)
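The hunk above makes the multiprocessing client honor skip_tokenizer_init. As an aside (not in the diff), the same switch can be exercised programmatically; a hedged sketch, assuming the offline LLM entry point accepts skip_tokenizer_init and task keywords mirroring the server's --skip-tokenizer-init and --task flags (check your vLLM version's EngineArgs for drift):

    from vllm import LLM

    # Assumption: these keyword arguments mirror the CLI flags used by the
    # test fixture above.
    llm = LLM(
        model="christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
        task="embed",
        trust_remote_code=True,
        skip_tokenizer_init=True,
        enforce_eager=True,
    )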
@@ -375,7 +380,10 @@ class MQLLMEngineClient(EngineClient):
         return self.input_preprocessor
 
     async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None):
-        return await self.tokenizer.get_lora_tokenizer_async(lora_request)
+        if self.tokenizer is None:
+            return None
+        else:
+            return await self.tokenizer.get_lora_tokenizer_async(lora_request)
 
     async def get_vllm_config(self) -> VllmConfig:
         return self.vllm_config
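With this change, get_tokenizer() can resolve to None, so call sites need a guard before tokenizing. An illustrative, self-contained sketch (MockEngineClient is a stand-in, not vLLM API):

    import asyncio
    from typing import Optional


    class MockEngineClient:
        """Stand-in mimicking MQLLMEngineClient with the tokenizer skipped."""

        def __init__(self, tokenizer: Optional[object] = None):
            self.tokenizer = tokenizer

        async def get_tokenizer(self, lora_request: object = None):
            if self.tokenizer is None:
                return None
            return self.tokenizer


    async def main() -> None:
        client = MockEngineClient()  # as if started with --skip-tokenizer-init
        tokenizer = await client.get_tokenizer()
        if tokenizer is None:
            print("No tokenizer: requests must carry prompt_token_ids.")


    asyncio.run(main())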
@@ -880,7 +880,10 @@ class OpenAIServing:
         _chat_template_kwargs.update(chat_template_kwargs or {})
 
         request_prompt: Union[str, list[int]]
-        if isinstance(tokenizer, MistralTokenizer):
+
+        if tokenizer is None:
+            request_prompt = "placeholder"
+        elif isinstance(tokenizer, MistralTokenizer):
             request_prompt = apply_mistral_chat_template(
                 tokenizer,
                 messages=messages,
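The "placeholder" string is never tokenized: with the tokenizer skipped, the real token ids travel in the request body, as the test above does with "prompt_token_ids": [1]. A minimal payload sketch (field names taken from the test; values illustrative):

    payload = {
        "model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
        # No tokenizer on the server, so the client supplies ids directly.
        "additional_data": {"prompt_token_ids": [1]},
        "encoding_format": "base64",
        # Content would carry the base64-encoded image_embeds as in the test.
        "messages": [{"role": "user", "content": []}],
    }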
@@ -910,7 +913,14 @@ class OpenAIServing:
             request = tool_parser(tokenizer).adjust_request(  # type: ignore
                 request=request)
 
-        if isinstance(request_prompt, str):
+        if tokenizer is None:
+            assert isinstance(request_prompt, str), (
+                "Prompt has to be a string "
+                "when the tokenizer is not initialised"
+            )
+            prompt_inputs = TextTokensPrompt(prompt=request_prompt,
+                                             prompt_token_ids=[1])
+        elif isinstance(request_prompt, str):
             prompt_inputs = await self._tokenize_prompt_input_async(
                 request,
                 tokenizer,
@@ -96,7 +96,11 @@ class OpenAIServingPooling(OpenAIServing):
                 self.max_model_len, truncate_prompt_tokens)
             lora_request = self._maybe_get_adapters(request)
 
-            tokenizer = await self.engine_client.get_tokenizer(lora_request)
+            if self.model_config.skip_tokenizer_init:
+                tokenizer = None
+            else:
+                tokenizer = await self.engine_client.get_tokenizer(
+                    lora_request)
 
             if isinstance(request, PoolingChatRequest):
                 (
@@ -103,7 +103,10 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
         mm_kwargs = {}
 
         for k, v in mm_data.items():
-            mm_kwargs[k] = v
+            if isinstance(v, dict) and k == "image":
+                mm_kwargs.update(v)
+            else:
+                mm_kwargs[k] = v
         mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]}
 
         # This model receives in input a multi-dimensional tensor representing
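A self-contained sketch of what the new branch does to mm_data (toy string values stand in for the real tensors):

    mm_data = {
        "image": {"pixel_values": "tensor-a", "location_coords": "tensor-b"},
        "other": "kept-as-is",
    }

    mm_kwargs = {}
    for k, v in mm_data.items():
        # Flatten the nested "image" dict so pixel_values / location_coords
        # become top-level model kwargs; everything else passes through.
        if isinstance(v, dict) and k == "image":
            mm_kwargs.update(v)
        else:
            mm_kwargs[k] = v

    print(mm_kwargs)
    # {'pixel_values': 'tensor-a', 'location_coords': 'tensor-b',
    #  'other': 'kept-as-is'}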