diff --git a/tests/entrypoints/openai/test_skip_tokenizer.py b/tests/entrypoints/openai/test_skip_tokenizer.py
new file mode 100644
index 0000000000000..32d28277e0ef8
--- /dev/null
+++ b/tests/entrypoints/openai/test_skip_tokenizer.py
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import base64
+import io
+
+import numpy as np
+import pytest
+import requests
+import torch
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"
+DTYPE = "float16"
+
+
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--task",
+        "embed",
+        # use half precision for speed and memory savings in CI environment
+        "--dtype",
+        DTYPE,
+        "--enforce-eager",
+        "--trust-remote-code",
+        "--skip-tokenizer-init",
+        "--max-num-seqs",
+        "32"
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_request(server: RemoteOpenAIServer, model_name: str):
+
+    pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
+    location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)
+
+    buffer_tiff = io.BytesIO()
+    torch.save(pixel_values, buffer_tiff)
+    buffer_tiff.seek(0)
+    binary_data = buffer_tiff.read()
+    base64_tensor_embedding = base64.b64encode(binary_data).decode('utf-8')
+
+    buffer_coord = io.BytesIO()
+    torch.save(location_coords, buffer_coord)
+    buffer_coord.seek(0)
+    binary_data = buffer_coord.read()
+    base64_coord_embedding = base64.b64encode(binary_data).decode('utf-8')
+
+    prompt = {
+        "model":
+        model_name,
+        "additional_data": {
+            "prompt_token_ids": [1]
+        },
+        "encoding_format":
+        "base64",
+        "messages": [{
+            "role":
+            "user",
+            "content": [{
+                "type": "image_embeds",
+                "image_embeds": {
+                    "pixel_values": base64_tensor_embedding,
+                    "location_coords": base64_coord_embedding,
+                },
+            }],
+        }]
+    }
+
+    # test single pooling
+    response = requests.post(server.url_for("pooling"), json=prompt)
+    response.raise_for_status()
+
+    output = response.json()["data"][0]['data']
+
+    np_response = np.frombuffer(base64.b64decode(output), dtype=np.float32)
+
+    assert len(np_response) == 524288
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index 67d9a3bf6ce20..cde8fc367fb54 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -97,11 +97,16 @@ class MQLLMEngineClient(EngineClient):
         self.model_config = engine_config.model_config
         self.decoding_config = engine_config.decoding_config
 
-        # Create the tokenizer group.
-        self.tokenizer = init_tokenizer_from_configs(
-            model_config=self.model_config,
-            scheduler_config=engine_config.scheduler_config,
-            lora_config=engine_config.lora_config)
+        if self.vllm_config.model_config.skip_tokenizer_init:
+            self.tokenizer = None
+
+        else:
+            # Create the tokenizer group.
+            self.tokenizer = init_tokenizer_from_configs(
+                model_config=self.model_config,
+                scheduler_config=engine_config.scheduler_config,
+                lora_config=engine_config.lora_config)
+
         self.input_preprocessor = InputPreprocessor(self.model_config,
                                                     self.tokenizer)
 
@@ -375,7 +380,10 @@ class MQLLMEngineClient(EngineClient):
         return self.input_preprocessor
 
     async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None):
-        return await self.tokenizer.get_lora_tokenizer_async(lora_request)
+        if self.tokenizer is None:
+            return None
+        else:
+            return await self.tokenizer.get_lora_tokenizer_async(lora_request)
 
     async def get_vllm_config(self) -> VllmConfig:
         return self.vllm_config
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index edc366f9b8a88..9d848679d5d98 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -880,7 +880,10 @@ class OpenAIServing:
         _chat_template_kwargs.update(chat_template_kwargs or {})
 
         request_prompt: Union[str, list[int]]
-        if isinstance(tokenizer, MistralTokenizer):
+
+        if tokenizer is None:
+            request_prompt = "placeholder"
+        elif isinstance(tokenizer, MistralTokenizer):
             request_prompt = apply_mistral_chat_template(
                 tokenizer,
                 messages=messages,
@@ -910,7 +913,14 @@ class OpenAIServing:
             request = tool_parser(tokenizer).adjust_request(  # type: ignore
                 request=request)
 
-        if isinstance(request_prompt, str):
+        if tokenizer is None:
+            assert isinstance(request_prompt, str), (
+                "Prompt has to be a string "
+                "when the tokenizer is not initialised"
+            )
+            prompt_inputs = TextTokensPrompt(prompt=request_prompt,
+                                             prompt_token_ids=[1])
+        elif isinstance(request_prompt, str):
             prompt_inputs = await self._tokenize_prompt_input_async(
                 request,
                 tokenizer,
diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py
index 12334cdac365a..38745d001ade6 100644
--- a/vllm/entrypoints/openai/serving_pooling.py
+++ b/vllm/entrypoints/openai/serving_pooling.py
@@ -96,7 +96,11 @@ class OpenAIServingPooling(OpenAIServing):
                 self.max_model_len, truncate_prompt_tokens)
             lora_request = self._maybe_get_adapters(request)
-            tokenizer = await self.engine_client.get_tokenizer(lora_request)
+            if self.model_config.skip_tokenizer_init:
+                tokenizer = None
+            else:
+                tokenizer = await self.engine_client.get_tokenizer(
+                    lora_request)
 
             if isinstance(request, PoolingChatRequest):
                 (
diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py
index 0f00fd47fe4fc..304a9e987ee03 100644
--- a/vllm/model_executor/models/prithvi_geospatial_mae.py
+++ b/vllm/model_executor/models/prithvi_geospatial_mae.py
@@ -103,7 +103,10 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
 
         mm_kwargs = {}
         for k, v in mm_data.items():
-            mm_kwargs[k] = v
+            if isinstance(v, dict) and k == "image":
+                mm_kwargs.update(v)
+            else:
+                mm_kwargs[k] = v
 
         mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]}
         # This model receives in input a multi-dimensional tensor representing
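
Reviewer note (not part of the patch): the sketch below shows how a client might exercise the new skip-tokenizer-init pooling path end to end. It mirrors the request shape used in test_single_request above; the localhost URL and port are assumptions made for illustration, since the test resolves the endpoint via RemoteOpenAIServer.url_for("pooling").

import base64
import io

import numpy as np
import requests
import torch


def encode_tensor(t: torch.Tensor) -> str:
    # Same wire format as the test: torch.save into a buffer, then base64.
    buf = io.BytesIO()
    torch.save(t, buf)
    return base64.b64encode(buf.getvalue()).decode("utf-8")


payload = {
    "model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
    # With --skip-tokenizer-init there is no tokenizer, so token ids are
    # supplied directly instead of being derived from text.
    "additional_data": {"prompt_token_ids": [1]},
    "encoding_format": "base64",
    "messages": [{
        "role": "user",
        "content": [{
            "type": "image_embeds",
            "image_embeds": {
                "pixel_values": encode_tensor(
                    torch.full((6, 512, 512), 1.0, dtype=torch.float16)),
                "location_coords": encode_tensor(
                    torch.full((1, 2), 1.0, dtype=torch.float16)),
            },
        }],
    }],
}

# Hypothetical local endpoint; adjust host/port to wherever the server
# started with --skip-tokenizer-init is listening.
response = requests.post("http://localhost:8000/pooling", json=payload)
response.raise_for_status()
embedding = np.frombuffer(
    base64.b64decode(response.json()["data"][0]["data"]), dtype=np.float32)
print(len(embedding))  # 524288 for this model and input size, per the test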