mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-28 17:37:08 +08:00
Merge branch 'main' into v1-blocktable-opt
This commit is contained in:
commit
ff5b1033dc
@ -319,7 +319,7 @@ See [this page](#generative-models) for more information on how to use generativ
|
|||||||
- ✅︎
|
- ✅︎
|
||||||
* - :code:`Qwen2ForCausalLM`
|
* - :code:`Qwen2ForCausalLM`
|
||||||
- Qwen2
|
- Qwen2
|
||||||
- :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc.
|
- :code:`Qwen/QwQ-32B-Preview`, :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc.
|
||||||
- ✅︎
|
- ✅︎
|
||||||
- ✅︎
|
- ✅︎
|
||||||
* - :code:`Qwen2MoeForCausalLM`
|
* - :code:`Qwen2MoeForCausalLM`
|
||||||
@ -710,7 +710,7 @@ See [this page](#generative-models) for more information on how to use generativ
|
|||||||
* - :code:`Qwen2VLForConditionalGeneration`
|
* - :code:`Qwen2VLForConditionalGeneration`
|
||||||
- Qwen2-VL
|
- Qwen2-VL
|
||||||
- T + I\ :sup:`E+` + V\ :sup:`E+`
|
- T + I\ :sup:`E+` + V\ :sup:`E+`
|
||||||
- :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
|
- :code:`Qwen/QVQ-72B-Preview`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
|
||||||
- ✅︎
|
- ✅︎
|
||||||
- ✅︎
|
- ✅︎
|
||||||
-
|
-
|
||||||
|
|||||||
@ -294,12 +294,58 @@ $ export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
|
|||||||
|
|
||||||
### Video
|
### Video
|
||||||
|
|
||||||
Instead of {code}`image_url`, you can pass a video file via {code}`video_url`.
|
Instead of {code}`image_url`, you can pass a video file via {code}`video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf).
|
||||||
|
|
||||||
You can use [these tests](gh-file:entrypoints/openai/test_video.py) as reference.
|
First, launch the OpenAI-compatible server:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model-len 8192
|
||||||
|
```
|
||||||
|
|
||||||
|
Then, you can use the OpenAI client as follows:
|
||||||
|
```python
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
openai_api_key = "EMPTY"
|
||||||
|
openai_api_base = "http://localhost:8000/v1"
|
||||||
|
|
||||||
|
client = OpenAI(
|
||||||
|
api_key=openai_api_key,
|
||||||
|
base_url=openai_api_base,
|
||||||
|
)
|
||||||
|
|
||||||
|
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
|
||||||
|
|
||||||
|
## Use video url in the payload
|
||||||
|
chat_completion_from_url = client.chat.completions.create(
|
||||||
|
messages=[{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "What's in this video?"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "video_url",
|
||||||
|
"video_url": {
|
||||||
|
"url": video_url
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
model=model,
|
||||||
|
max_completion_tokens=64,
|
||||||
|
)
|
||||||
|
|
||||||
|
result = chat_completion_from_url.choices[0].message.content
|
||||||
|
print("Chat completion output from image url:", result)
|
||||||
|
```
|
||||||
|
|
||||||
|
Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>
|
||||||
|
|
||||||
````{note}
|
````{note}
|
||||||
By default, the timeout for fetching videos through HTTP URL url is `30` seconds.
|
By default, the timeout for fetching videos through HTTP URL is `30` seconds.
|
||||||
You can override this by setting the environment variable:
|
You can override this by setting the environment variable:
|
||||||
|
|
||||||
```console
|
```console
|
||||||
|
|||||||
@ -18,7 +18,6 @@ import base64
|
|||||||
import requests
|
import requests
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
|
|
||||||
from vllm.assets.audio import AudioAsset
|
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||||
@ -151,8 +150,66 @@ def run_multi_image() -> None:
|
|||||||
print("Chat completion output:", result)
|
print("Chat completion output:", result)
|
||||||
|
|
||||||
|
|
||||||
|
# Video input inference
|
||||||
|
def run_video() -> None:
|
||||||
|
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
|
||||||
|
video_base64 = encode_base64_content_from_url(video_url)
|
||||||
|
|
||||||
|
## Use video url in the payload
|
||||||
|
chat_completion_from_url = client.chat.completions.create(
|
||||||
|
messages=[{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "What's in this video?"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "video_url",
|
||||||
|
"video_url": {
|
||||||
|
"url": video_url
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
model=model,
|
||||||
|
max_completion_tokens=64,
|
||||||
|
)
|
||||||
|
|
||||||
|
result = chat_completion_from_url.choices[0].message.content
|
||||||
|
print("Chat completion output from image url:", result)
|
||||||
|
|
||||||
|
## Use base64 encoded video in the payload
|
||||||
|
chat_completion_from_base64 = client.chat.completions.create(
|
||||||
|
messages=[{
|
||||||
|
"role":
|
||||||
|
"user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "What's in this video?"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "video_url",
|
||||||
|
"video_url": {
|
||||||
|
"url": f"data:video/mp4;base64,{video_base64}"
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}],
|
||||||
|
model=model,
|
||||||
|
max_completion_tokens=64,
|
||||||
|
)
|
||||||
|
|
||||||
|
result = chat_completion_from_base64.choices[0].message.content
|
||||||
|
print("Chat completion output from base64 encoded image:", result)
|
||||||
|
|
||||||
|
|
||||||
# Audio input inference
|
# Audio input inference
|
||||||
def run_audio() -> None:
|
def run_audio() -> None:
|
||||||
|
from vllm.assets.audio import AudioAsset
|
||||||
|
|
||||||
audio_url = AudioAsset("winning_call").url
|
audio_url = AudioAsset("winning_call").url
|
||||||
audio_base64 = encode_base64_content_from_url(audio_url)
|
audio_base64 = encode_base64_content_from_url(audio_url)
|
||||||
|
|
||||||
@ -240,6 +297,7 @@ example_function_map = {
|
|||||||
"text-only": run_text_only,
|
"text-only": run_text_only,
|
||||||
"single-image": run_single_image,
|
"single-image": run_single_image,
|
||||||
"multi-image": run_multi_image,
|
"multi-image": run_multi_image,
|
||||||
|
"video": run_video,
|
||||||
"audio": run_audio,
|
"audio": run_audio,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -253,12 +311,11 @@ if __name__ == "__main__":
|
|||||||
parser = FlexibleArgumentParser(
|
parser = FlexibleArgumentParser(
|
||||||
description='Demo on using OpenAI client for online inference with '
|
description='Demo on using OpenAI client for online inference with '
|
||||||
'multimodal language models served with vLLM.')
|
'multimodal language models served with vLLM.')
|
||||||
parser.add_argument(
|
parser.add_argument('--chat-type',
|
||||||
'--chat-type',
|
'-c',
|
||||||
'-c',
|
type=str,
|
||||||
type=str,
|
default="single-image",
|
||||||
default="single-image",
|
choices=list(example_function_map.keys()),
|
||||||
choices=["text-only", "single-image", "multi-image", "audio"],
|
help='Conversation type with multimodal data.')
|
||||||
help='Conversation type with multimodal data.')
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args)
|
main(args)
|
||||||
|
|||||||
@ -74,7 +74,7 @@ def test_load_checkpoints(
|
|||||||
embedding_padding_modules=embed_padding_modules)
|
embedding_padding_modules=embed_padding_modules)
|
||||||
|
|
||||||
|
|
||||||
def test_lora_weights_mapping(baichuan_lora_files, ):
|
def test_lora_weights_mapping(baichuan_lora_files):
|
||||||
supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules
|
supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules
|
||||||
packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
|
packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
|
||||||
embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
|
embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
|
||||||
@ -86,10 +86,14 @@ def test_lora_weights_mapping(baichuan_lora_files, ):
|
|||||||
else:
|
else:
|
||||||
expected_lora_modules.append(module)
|
expected_lora_modules.append(module)
|
||||||
|
|
||||||
hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
|
hf_to_vllm_mapper = WeightsMapper(
|
||||||
"model.": "language_model.model.",
|
orig_to_new_prefix={
|
||||||
}, )
|
"model.": "language_model.model.",
|
||||||
|
},
|
||||||
|
orig_to_new_substr={
|
||||||
|
".layers.": ".baichuan_layers.",
|
||||||
|
},
|
||||||
|
)
|
||||||
lora_model = LoRAModel.from_local_checkpoint(
|
lora_model = LoRAModel.from_local_checkpoint(
|
||||||
baichuan_lora_files,
|
baichuan_lora_files,
|
||||||
expected_lora_modules,
|
expected_lora_modules,
|
||||||
@ -101,3 +105,4 @@ def test_lora_weights_mapping(baichuan_lora_files, ):
|
|||||||
)
|
)
|
||||||
for name in lora_model.loras:
|
for name in lora_model.loras:
|
||||||
assert name.startswith(hf_to_vllm_mapper.orig_to_new_prefix["model."])
|
assert name.startswith(hf_to_vllm_mapper.orig_to_new_prefix["model."])
|
||||||
|
assert ".baichuan_layers." in name
|
||||||
|
|||||||
@ -22,7 +22,7 @@ IMAGE_ASSETS = [
|
|||||||
|
|
||||||
# After fine-tuning with LoRA, all generated content should start begin `A`.
|
# After fine-tuning with LoRA, all generated content should start begin `A`.
|
||||||
EXPECTED_OUTPUT = [
|
EXPECTED_OUTPUT = [
|
||||||
"A stop sign stands prominently in the foreground, with a traditional Chinese gate and a black SUV in the background, illustrating a blend of modern and cultural elements.", # noqa: E501
|
"A red stop sign stands prominently in the foreground, with a traditional Chinese gate and a black SUV in the background, illustrating a blend of modern and cultural elements.", # noqa: E501
|
||||||
"A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.", # noqa: E501
|
"A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.", # noqa: E501
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -76,3 +76,7 @@ def test_qwen2vl_lora(qwen2vl_lora_files):
|
|||||||
output1 = do_sample(llm, qwen2vl_lora_files, lora_id=1)
|
output1 = do_sample(llm, qwen2vl_lora_files, lora_id=1)
|
||||||
for i in range(len(EXPECTED_OUTPUT)):
|
for i in range(len(EXPECTED_OUTPUT)):
|
||||||
assert EXPECTED_OUTPUT[i].startswith(output1[i])
|
assert EXPECTED_OUTPUT[i].startswith(output1[i])
|
||||||
|
|
||||||
|
output2 = do_sample(llm, qwen2vl_lora_files, lora_id=2)
|
||||||
|
for i in range(len(EXPECTED_OUTPUT)):
|
||||||
|
assert EXPECTED_OUTPUT[i].startswith(output2[i])
|
||||||
|
|||||||
@ -3,7 +3,7 @@ from typing import List, Optional, Type
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.multimodal.utils import rescale_image_size
|
from vllm.multimodal.image import rescale_image_size
|
||||||
|
|
||||||
from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
|
from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
|
||||||
from ...utils import check_logprobs_close
|
from ...utils import check_logprobs_close
|
||||||
|
|||||||
@ -8,7 +8,7 @@ from transformers import AutoConfig
|
|||||||
# Import the functions to test
|
# Import the functions to test
|
||||||
from vllm.model_executor.models.h2ovl import (calculate_num_blocks,
|
from vllm.model_executor.models.h2ovl import (calculate_num_blocks,
|
||||||
image_to_pixel_values_wrapper)
|
image_to_pixel_values_wrapper)
|
||||||
from vllm.multimodal.utils import rescale_image_size
|
from vllm.multimodal.image import rescale_image_size
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
"h2oai/h2ovl-mississippi-800m", # Replace with your actual model names
|
"h2oai/h2ovl-mississippi-800m", # Replace with your actual model names
|
||||||
|
|||||||
@ -5,7 +5,7 @@ from typing import List, Optional, Tuple, Type
|
|||||||
import pytest
|
import pytest
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
from vllm.multimodal.utils import rescale_image_size
|
from vllm.multimodal.image import rescale_image_size
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.sequence import SampleLogprobs
|
from vllm.sequence import SampleLogprobs
|
||||||
|
|
||||||
|
|||||||
@ -6,8 +6,8 @@ import torch
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from vllm.entrypoints.llm import LLM
|
from vllm.entrypoints.llm import LLM
|
||||||
from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
|
from vllm.multimodal.image import rescale_image_size
|
||||||
sample_frames_from_video)
|
from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
|
||||||
|
|
||||||
from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
|
from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
|
||||||
PromptVideoInput, VllmRunner)
|
PromptVideoInput, VllmRunner)
|
||||||
|
|||||||
@ -5,8 +5,9 @@ from typing import Callable, Iterable, List, Optional, Tuple, Union
|
|||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
|
from vllm.multimodal.image import rescale_image_size
|
||||||
resize_video, sample_frames_from_video)
|
from vllm.multimodal.video import (rescale_video_size, resize_video,
|
||||||
|
sample_frames_from_video)
|
||||||
|
|
||||||
from .....conftest import _ImageAssets, _VideoAssets
|
from .....conftest import _ImageAssets, _VideoAssets
|
||||||
from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER,
|
from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER,
|
||||||
|
|||||||
@ -1,8 +1,9 @@
|
|||||||
"""Custom input builders for edge-cases in different models."""
|
"""Custom input builders for edge-cases in different models."""
|
||||||
from typing import Callable
|
from typing import Callable
|
||||||
|
|
||||||
from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
|
from vllm.multimodal.image import rescale_image_size
|
||||||
resize_video, sample_frames_from_video)
|
from vllm.multimodal.video import (rescale_video_size, resize_video,
|
||||||
|
sample_frames_from_video)
|
||||||
|
|
||||||
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
|
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
|
||||||
from .builders import build_multi_image_inputs, build_single_image_inputs
|
from .builders import build_multi_image_inputs, build_single_image_inputs
|
||||||
|
|||||||
@ -6,7 +6,7 @@ from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
|
|||||||
|
|
||||||
from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
|
from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
|
||||||
global_force_attn_backend_context_manager)
|
global_force_attn_backend_context_manager)
|
||||||
from vllm.multimodal.utils import rescale_image_size
|
from vllm.multimodal.image import rescale_image_size
|
||||||
from vllm.sequence import SampleLogprobs
|
from vllm.sequence import SampleLogprobs
|
||||||
|
|
||||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
|
||||||
|
|||||||
@ -6,7 +6,7 @@ from transformers import LlavaNextImageProcessor
|
|||||||
|
|
||||||
from vllm.config import ModelConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.multimodal import MultiModalRegistry
|
from vllm.multimodal import MultiModalRegistry
|
||||||
from vllm.multimodal.utils import rescale_image_size
|
from vllm.multimodal.image import rescale_image_size
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
|||||||
@ -9,7 +9,6 @@ import openai
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
from huggingface_hub import snapshot_download
|
from huggingface_hub import snapshot_download
|
||||||
from tensorizer import EncryptionParams
|
|
||||||
|
|
||||||
from vllm import SamplingParams
|
from vllm import SamplingParams
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
@ -23,12 +22,18 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
|
|||||||
serialize_vllm_model,
|
serialize_vllm_model,
|
||||||
tensorize_vllm_model)
|
tensorize_vllm_model)
|
||||||
# yapf: enable
|
# yapf: enable
|
||||||
from vllm.utils import import_from_path
|
from vllm.utils import PlaceholderModule, import_from_path
|
||||||
|
|
||||||
from ..conftest import VllmRunner
|
from ..conftest import VllmRunner
|
||||||
from ..utils import VLLM_PATH, RemoteOpenAIServer
|
from ..utils import VLLM_PATH, RemoteOpenAIServer
|
||||||
from .conftest import retry_until_skip
|
from .conftest import retry_until_skip
|
||||||
|
|
||||||
|
try:
|
||||||
|
from tensorizer import EncryptionParams
|
||||||
|
except ImportError:
|
||||||
|
tensorizer = PlaceholderModule("tensorizer") # type: ignore[assignment]
|
||||||
|
EncryptionParams = tensorizer.placeholder_attr("EncryptionParams")
|
||||||
|
|
||||||
EXAMPLES_PATH = VLLM_PATH / "examples"
|
EXAMPLES_PATH = VLLM_PATH / "examples"
|
||||||
|
|
||||||
prompts = [
|
prompts = [
|
||||||
|
|||||||
@ -139,3 +139,41 @@ def test_engine_core(monkeypatch):
|
|||||||
engine_core.abort_requests([req2.request_id, req0.request_id])
|
engine_core.abort_requests([req2.request_id, req0.request_id])
|
||||||
assert len(engine_core.scheduler.waiting) == 0
|
assert len(engine_core.scheduler.waiting) == 0
|
||||||
assert len(engine_core.scheduler.running) == 0
|
assert len(engine_core.scheduler.running) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_engine_core_advanced_sampling(monkeypatch):
|
||||||
|
"""
|
||||||
|
A basic end-to-end test to verify that the engine functions correctly
|
||||||
|
when additional sampling parameters, such as min_tokens and
|
||||||
|
presence_penalty, are set.
|
||||||
|
"""
|
||||||
|
with monkeypatch.context() as m:
|
||||||
|
m.setenv("VLLM_USE_V1", "1")
|
||||||
|
"""Setup the EngineCore."""
|
||||||
|
engine_args = EngineArgs(model=MODEL_NAME)
|
||||||
|
vllm_config = engine_args.create_engine_config(
|
||||||
|
usage_context=UsageContext.UNKNOWN_CONTEXT)
|
||||||
|
executor_class = AsyncLLM._get_executor_cls(vllm_config)
|
||||||
|
|
||||||
|
engine_core = EngineCore(vllm_config=vllm_config,
|
||||||
|
executor_class=executor_class,
|
||||||
|
usage_context=UsageContext.UNKNOWN_CONTEXT)
|
||||||
|
"""Test basic request lifecycle."""
|
||||||
|
# First request.
|
||||||
|
request: EngineCoreRequest = make_request()
|
||||||
|
request.sampling_params = SamplingParams(
|
||||||
|
min_tokens=4,
|
||||||
|
presence_penalty=1.0,
|
||||||
|
frequency_penalty=1.0,
|
||||||
|
repetition_penalty=0.1,
|
||||||
|
stop_token_ids=[1001, 1002],
|
||||||
|
)
|
||||||
|
engine_core.add_request(request)
|
||||||
|
assert len(engine_core.scheduler.waiting) == 1
|
||||||
|
assert len(engine_core.scheduler.running) == 0
|
||||||
|
# Loop through until they are all done.
|
||||||
|
while len(engine_core.step()) > 0:
|
||||||
|
pass
|
||||||
|
|
||||||
|
assert len(engine_core.scheduler.waiting) == 0
|
||||||
|
assert len(engine_core.scheduler.running) == 0
|
||||||
|
|||||||
0
tests/v1/sample/__init__.py
Normal file
0
tests/v1/sample/__init__.py
Normal file
331
tests/v1/sample/test_sampler.py
Normal file
331
tests/v1/sample/test_sampler.py
Normal file
@ -0,0 +1,331 @@
|
|||||||
|
from typing import List, Set, Tuple
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm.utils import make_tensor_with_pad
|
||||||
|
from vllm.v1.sample.metadata import SamplingMetadata
|
||||||
|
from vllm.v1.sample.sampler import Sampler
|
||||||
|
|
||||||
|
VOCAB_SIZE = 1024
|
||||||
|
NUM_OUTPUT_TOKENS = 20
|
||||||
|
CUDA_DEVICES = [
|
||||||
|
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
|
||||||
|
]
|
||||||
|
MAX_NUM_PROMPT_TOKENS = 64
|
||||||
|
|
||||||
|
|
||||||
|
def _create_fake_logits(batch_size: int, vocab_size: int) -> torch.Tensor:
|
||||||
|
fake_logits = torch.full((batch_size, vocab_size), 1e-2, dtype=torch.float)
|
||||||
|
return fake_logits
|
||||||
|
|
||||||
|
|
||||||
|
def _create_penalty_tensor(batch_size: int, penalty_value: float,
|
||||||
|
device: torch.device) -> torch.Tensor:
|
||||||
|
return torch.full((batch_size, ),
|
||||||
|
fill_value=penalty_value,
|
||||||
|
dtype=torch.float,
|
||||||
|
device=device)
|
||||||
|
|
||||||
|
|
||||||
|
def _create_prompt_tokens_tensor(
|
||||||
|
prompt_token_ids: List[List[int]],
|
||||||
|
vocab_size: int,
|
||||||
|
device: torch.device,
|
||||||
|
) -> torch.Tensor:
|
||||||
|
return make_tensor_with_pad(
|
||||||
|
prompt_token_ids,
|
||||||
|
pad=vocab_size,
|
||||||
|
device=device,
|
||||||
|
dtype=torch.int64,
|
||||||
|
pin_memory=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _create_default_sampling_metadata(
|
||||||
|
num_output_tokens: int,
|
||||||
|
batch_size: int,
|
||||||
|
vocab_size: int,
|
||||||
|
device: torch.device,
|
||||||
|
) -> SamplingMetadata:
|
||||||
|
output_token_ids: List[List[int]] = []
|
||||||
|
prompt_token_ids: List[List[int]] = []
|
||||||
|
for _ in range(batch_size):
|
||||||
|
output_token_ids.append(
|
||||||
|
np.random.randint(0, vocab_size, size=num_output_tokens).tolist())
|
||||||
|
prompt_token_ids.append(
|
||||||
|
np.random.randint(0,
|
||||||
|
vocab_size,
|
||||||
|
size=np.random.randint(
|
||||||
|
1, MAX_NUM_PROMPT_TOKENS)).tolist())
|
||||||
|
fake_sampling_metadata = SamplingMetadata(
|
||||||
|
temperature=torch.full((batch_size, ), 0.0),
|
||||||
|
all_greedy=True,
|
||||||
|
all_random=False,
|
||||||
|
top_p=torch.empty(batch_size, ),
|
||||||
|
top_k=torch.empty(batch_size, ),
|
||||||
|
no_top_p=True,
|
||||||
|
no_top_k=True,
|
||||||
|
generators={},
|
||||||
|
max_num_logprobs=VOCAB_SIZE,
|
||||||
|
prompt_token_ids=_create_prompt_tokens_tensor(prompt_token_ids,
|
||||||
|
vocab_size, device),
|
||||||
|
output_token_ids=output_token_ids,
|
||||||
|
frequency_penalties=_create_penalty_tensor(batch_size, 0.0, device),
|
||||||
|
presence_penalties=_create_penalty_tensor(batch_size, 0.0, device),
|
||||||
|
repetition_penalties=_create_penalty_tensor(batch_size, 1.0, device),
|
||||||
|
no_penalties=True,
|
||||||
|
min_tokens=[],
|
||||||
|
stop_token_ids=[],
|
||||||
|
)
|
||||||
|
return fake_sampling_metadata
|
||||||
|
|
||||||
|
|
||||||
|
def _generate_min_token_penalties_and_stop_tokens(
|
||||||
|
num_output_tokens: int, batch_size: int, vocab_size: int,
|
||||||
|
batch_indices_for_min_token_penalty: List[int]
|
||||||
|
) -> Tuple[List[int], List[Set[int]]]:
|
||||||
|
"""
|
||||||
|
Generates and returns a list of minimum token penalties (`min_tokens`)
|
||||||
|
and a corresponding list of stop token IDs (`stop_token_ids`) for each
|
||||||
|
batch.
|
||||||
|
|
||||||
|
If a batch index is included in `batch_indices_for_min_token_penalty`,
|
||||||
|
a higher `min_tokens` value is assigned (within a randomized range),
|
||||||
|
and a random set of stop token IDs is created. Otherwise, a lower
|
||||||
|
`min_tokens` value is assigned, and the stop token IDs set is empty.
|
||||||
|
"""
|
||||||
|
stop_token_ids: List[Set[int]] = []
|
||||||
|
min_tokens: List[int] = []
|
||||||
|
for index in range(batch_size):
|
||||||
|
if index in batch_indices_for_min_token_penalty:
|
||||||
|
min_tokens.append(
|
||||||
|
np.random.randint(num_output_tokens + 1,
|
||||||
|
2 * num_output_tokens))
|
||||||
|
stop_token_ids.append(
|
||||||
|
set(
|
||||||
|
np.random.randint(0, vocab_size - 1)
|
||||||
|
for _ in range(np.random.randint(0, vocab_size))))
|
||||||
|
|
||||||
|
else:
|
||||||
|
min_tokens.append(np.random.randint(0, num_output_tokens))
|
||||||
|
stop_token_ids.append(set())
|
||||||
|
return (min_tokens, stop_token_ids)
|
||||||
|
|
||||||
|
|
||||||
|
def _create_weighted_output_token_list(
|
||||||
|
batch_size: int,
|
||||||
|
vocab_size: int) -> Tuple[List[List[int]], List[List[int]]]:
|
||||||
|
"""
|
||||||
|
Creates an output token list where each token occurs a distinct
|
||||||
|
number of times.
|
||||||
|
|
||||||
|
For each batch, a random subset of token IDs is selected from the
|
||||||
|
vocabulary. The selected tokens are then added to the output token
|
||||||
|
list, each with a different frequency.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple[List[List[int]], List[List[int]]]:
|
||||||
|
- The first element is the output token list, where each sublist
|
||||||
|
corresponds to a batch and contains tokens with weighted
|
||||||
|
frequencies.
|
||||||
|
- The second element is a list of distinct token IDs for each
|
||||||
|
batch, ordered by their frequency in the corresponding output
|
||||||
|
list.
|
||||||
|
"""
|
||||||
|
output_token_ids: List[List[int]] = []
|
||||||
|
sorted_token_ids_in_output: List[List[int]] = []
|
||||||
|
for _ in range(batch_size):
|
||||||
|
distinct_token_ids = np.random.choice(vocab_size,
|
||||||
|
size=np.random.randint(1, 10),
|
||||||
|
replace=False).tolist()
|
||||||
|
sorted_token_ids_in_output.append(distinct_token_ids)
|
||||||
|
output_token_ids_for_batch = []
|
||||||
|
for index, token_id in enumerate(distinct_token_ids):
|
||||||
|
output_token_ids_for_batch.extend(
|
||||||
|
[token_id for _ in range(index + 1)])
|
||||||
|
output_token_ids.append(output_token_ids_for_batch)
|
||||||
|
return (output_token_ids, sorted_token_ids_in_output)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||||
|
@pytest.mark.parametrize("batch_size", [1, 2, 32])
|
||||||
|
def test_sampler_min_tokens_penalty(device: str, batch_size: int):
|
||||||
|
"""
|
||||||
|
Tests that if the number of output tokens is less than
|
||||||
|
SamplingParams.min_tokens then we will set the logits for
|
||||||
|
the stop token ids to -inf.
|
||||||
|
"""
|
||||||
|
torch.set_default_device(device)
|
||||||
|
fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
|
||||||
|
sampling_metadata = _create_default_sampling_metadata(
|
||||||
|
NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device))
|
||||||
|
batch_indices_for_min_token_penalty = np.random.randint(
|
||||||
|
0, batch_size - 1, size=np.random.randint(0, batch_size)).tolist()
|
||||||
|
min_tokens, stop_token_ids = _generate_min_token_penalties_and_stop_tokens(
|
||||||
|
NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE,
|
||||||
|
batch_indices_for_min_token_penalty)
|
||||||
|
sampling_metadata.min_tokens = min_tokens
|
||||||
|
sampling_metadata.stop_token_ids = stop_token_ids
|
||||||
|
sampler = Sampler()
|
||||||
|
sampler_output = sampler(fake_logits, sampling_metadata)
|
||||||
|
for batch_idx in range(batch_size):
|
||||||
|
for vocab in range(VOCAB_SIZE):
|
||||||
|
# Verify that the logprobs for stop token ids is set
|
||||||
|
# to -inf.
|
||||||
|
logprob_index = torch.where(
|
||||||
|
sampler_output.logprob_token_ids[batch_idx] ==
|
||||||
|
vocab)[0].item()
|
||||||
|
if vocab in stop_token_ids[batch_idx]:
|
||||||
|
assert sampler_output.logprobs[batch_idx][
|
||||||
|
logprob_index] == -float("inf")
|
||||||
|
else:
|
||||||
|
assert sampler_output.logprobs[batch_idx][
|
||||||
|
logprob_index] != -float("inf")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||||
|
@pytest.mark.parametrize("batch_size", [1, 2, 32])
|
||||||
|
@pytest.mark.parametrize("presence_penalty", [-2.0, 2.0])
|
||||||
|
def test_sampler_presence_penalty(device: str, batch_size: int,
|
||||||
|
presence_penalty: float):
|
||||||
|
"""
|
||||||
|
Test to verify that if presence penalty is enabled then tokens
|
||||||
|
are penalized as per their presence in the existing output.
|
||||||
|
"""
|
||||||
|
torch.set_default_device(device)
|
||||||
|
# Create fake logits where each token is assigned the same
|
||||||
|
# logit value.
|
||||||
|
fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
|
||||||
|
sampling_metadata = _create_default_sampling_metadata(
|
||||||
|
NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device))
|
||||||
|
output_token_ids = sampling_metadata.output_token_ids
|
||||||
|
sampling_metadata.presence_penalties = _create_penalty_tensor(
|
||||||
|
batch_size, presence_penalty, torch.device(device))
|
||||||
|
sampling_metadata.no_penalties = False
|
||||||
|
sampler = Sampler()
|
||||||
|
sampler_output = sampler(fake_logits, sampling_metadata)
|
||||||
|
for batch_idx in range(batch_size):
|
||||||
|
# The logprobs in the SamplerOutput are arranged in descending order.
|
||||||
|
# Since all tokens initially have the same logprobs, the non-penalized
|
||||||
|
# tokens will appear at the beginning, while the penalized tokens
|
||||||
|
# will appear at the end of the list.
|
||||||
|
penalized_token_id = sampler_output.logprob_token_ids[batch_idx][
|
||||||
|
VOCAB_SIZE - 1]
|
||||||
|
penalized_log_prod = sampler_output.logprobs[batch_idx][VOCAB_SIZE - 1]
|
||||||
|
non_penalized_token_id = sampler_output.logprob_token_ids[batch_idx][0]
|
||||||
|
non_penalized_log_prod = sampler_output.logprobs[batch_idx][0]
|
||||||
|
assert non_penalized_log_prod > penalized_log_prod
|
||||||
|
if presence_penalty > 0:
|
||||||
|
# If `presence_penalty` is set to a value greater than 0, it
|
||||||
|
# indicates a preference for new tokens over those already
|
||||||
|
# present in the output.
|
||||||
|
# Verify that the penalized token ID exists in the output, while the
|
||||||
|
# non-penalized token ID does not.
|
||||||
|
assert penalized_token_id in output_token_ids[batch_idx]
|
||||||
|
assert non_penalized_token_id not in output_token_ids[batch_idx]
|
||||||
|
elif presence_penalty < 0:
|
||||||
|
# If `presence_penalty` is set to a value less than 0, it indicates
|
||||||
|
# a preference for existing tokens over new ones. Verify that the
|
||||||
|
# non-penalized token ID exists in the output, while the penalized
|
||||||
|
# token ID does not.
|
||||||
|
assert non_penalized_token_id in output_token_ids[batch_idx]
|
||||||
|
assert penalized_token_id not in output_token_ids[batch_idx]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||||
|
@pytest.mark.parametrize("batch_size", [1, 2, 32])
|
||||||
|
@pytest.mark.parametrize("frequency_penalty", [-2.0, 2.0])
|
||||||
|
def test_sampler_frequency_penalty(device: str, batch_size: int,
|
||||||
|
frequency_penalty: float):
|
||||||
|
"""
|
||||||
|
Test to verify that if frequency penalty is enabled then tokens are
|
||||||
|
penalized as per their frequency of occurrence.
|
||||||
|
"""
|
||||||
|
torch.set_default_device(device)
|
||||||
|
# Create fake logits where each token is assigned the same
|
||||||
|
# logit value.
|
||||||
|
fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
|
||||||
|
sampling_metadata = _create_default_sampling_metadata(
|
||||||
|
NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device))
|
||||||
|
sampling_metadata.frequency_penalties = _create_penalty_tensor(
|
||||||
|
batch_size, frequency_penalty, torch.device(device))
|
||||||
|
output_token_ids, sorted_token_ids_in_output = \
|
||||||
|
_create_weighted_output_token_list(batch_size, VOCAB_SIZE)
|
||||||
|
sampling_metadata.output_token_ids = output_token_ids
|
||||||
|
sampling_metadata.no_penalties = False
|
||||||
|
sampler = Sampler()
|
||||||
|
sampler_output = sampler(fake_logits, sampling_metadata)
|
||||||
|
for batch_idx in range(batch_size):
|
||||||
|
logprobs_token_ids = sampler_output.logprob_token_ids[batch_idx]
|
||||||
|
non_penalized_token_id = logprobs_token_ids[0]
|
||||||
|
penalized_token_id = logprobs_token_ids[VOCAB_SIZE - 1]
|
||||||
|
distinct_sorted_token_ids_in_output = \
|
||||||
|
sorted_token_ids_in_output[batch_idx]
|
||||||
|
most_frequent_token_id = distinct_sorted_token_ids_in_output[
|
||||||
|
len(distinct_sorted_token_ids_in_output) - 1]
|
||||||
|
if frequency_penalty > 0:
|
||||||
|
# If `frequency_penalty` is set to > 0, it indicates
|
||||||
|
# a preference for new tokens over existing ones. Verify that the
|
||||||
|
# non-penalized token ID is not present in the output, while the
|
||||||
|
# most penalized token is the one that occurs most frequently in
|
||||||
|
# the output.
|
||||||
|
assert non_penalized_token_id \
|
||||||
|
not in distinct_sorted_token_ids_in_output
|
||||||
|
assert penalized_token_id == most_frequent_token_id
|
||||||
|
elif frequency_penalty < 0:
|
||||||
|
# If `frequency_penalty` is set to < 0, it indicates
|
||||||
|
# a preference for existing tokens over new ones. Verify that the
|
||||||
|
# non-penalized token ID is the one that occurs most frequently
|
||||||
|
# in the output, while the penalized token ID is one that has not
|
||||||
|
# yet appeared.
|
||||||
|
assert non_penalized_token_id == most_frequent_token_id
|
||||||
|
assert penalized_token_id \
|
||||||
|
not in distinct_sorted_token_ids_in_output
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||||
|
@pytest.mark.parametrize("batch_size", [1, 2, 32])
|
||||||
|
@pytest.mark.parametrize("repetition_penalty", [0.1, 1.9])
|
||||||
|
def test_sampler_repetition_penalty(device: str, batch_size: int,
|
||||||
|
repetition_penalty: float):
|
||||||
|
"""
|
||||||
|
Test to verify that when the repetition penalty is enabled, tokens
|
||||||
|
are penalized based on their presence in the prompt or the existing
|
||||||
|
output.
|
||||||
|
"""
|
||||||
|
torch.set_default_device(device)
|
||||||
|
# Create fake logits where each token is assigned the same
|
||||||
|
# logit value.
|
||||||
|
fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
|
||||||
|
sampling_metadata = _create_default_sampling_metadata(
|
||||||
|
NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device))
|
||||||
|
sampling_metadata.repetition_penalties = _create_penalty_tensor(
|
||||||
|
batch_size, repetition_penalty, torch.device(device))
|
||||||
|
sampling_metadata.no_penalties = False
|
||||||
|
sampler = Sampler()
|
||||||
|
sampler_output = sampler(fake_logits, sampling_metadata)
|
||||||
|
for batch_idx in range(batch_size):
|
||||||
|
logprobs_token_ids = sampler_output.logprob_token_ids[batch_idx]
|
||||||
|
non_penalized_token_id = logprobs_token_ids[0]
|
||||||
|
penalized_token_id = logprobs_token_ids[VOCAB_SIZE - 1]
|
||||||
|
prompt_tokens = sampling_metadata.prompt_token_ids[
|
||||||
|
batch_idx][:].tolist()
|
||||||
|
output_tokens = sampling_metadata.output_token_ids[batch_idx]
|
||||||
|
if repetition_penalty > 1.0:
|
||||||
|
# If `repetition_penalty` > 1.0, verify that the non-penalized
|
||||||
|
# token ID has not been seen before, while the penalized token ID
|
||||||
|
# exists either in the prompt or the output.
|
||||||
|
assert (non_penalized_token_id not in prompt_tokens and \
|
||||||
|
non_penalized_token_id not in output_tokens)
|
||||||
|
assert (penalized_token_id in prompt_tokens or \
|
||||||
|
penalized_token_id in output_tokens)
|
||||||
|
elif repetition_penalty < 1.0:
|
||||||
|
# If `repetition_penalty` < 1.0, verify that the penalized
|
||||||
|
# token ID has not been seen before, while the non-penalized
|
||||||
|
# token ID exists either in the prompt or the output.
|
||||||
|
assert (penalized_token_id not in prompt_tokens and \
|
||||||
|
penalized_token_id not in output_tokens)
|
||||||
|
assert (non_penalized_token_id in prompt_tokens or \
|
||||||
|
non_penalized_token_id in output_tokens)
|
||||||
0
tests/v1/worker/__init__.py
Normal file
0
tests/v1/worker/__init__.py
Normal file
224
tests/v1/worker/test_gpu_input_batch.py
Normal file
224
tests/v1/worker/test_gpu_input_batch.py
Normal file
@ -0,0 +1,224 @@
|
|||||||
|
from typing import Dict, List, Set, Tuple
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm.sampling_params import SamplingParams
|
||||||
|
from vllm.utils import is_pin_memory_available, make_tensor_with_pad
|
||||||
|
from vllm.v1.sample.metadata import SamplingMetadata
|
||||||
|
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
|
||||||
|
|
||||||
|
VOCAB_SIZE = 1024
|
||||||
|
NUM_OUTPUT_TOKENS = 20
|
||||||
|
MAX_PROMPT_SIZE = 100
|
||||||
|
CUDA_DEVICES = [
|
||||||
|
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
|
||||||
|
]
|
||||||
|
MAX_NUM_PROMPT_TOKENS = 64
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_requests(
|
||||||
|
input_batch: InputBatch, batch_size: int,
|
||||||
|
reqs: List[CachedRequestState]) -> Tuple[Set[str], List[int]]:
|
||||||
|
"""
|
||||||
|
Remove some requests randomly from the batch and returns a Tuple
|
||||||
|
of 1) set of request removed 2) indices of the requests removed
|
||||||
|
ordered in descending order
|
||||||
|
"""
|
||||||
|
|
||||||
|
num_reqs_to_remove = np.random.randint(0, batch_size)
|
||||||
|
req_indices_to_remove: Set[int] = set()
|
||||||
|
for _ in range(num_reqs_to_remove):
|
||||||
|
req_index_to_remove = np.random.randint(0, batch_size)
|
||||||
|
req_indices_to_remove.add(req_index_to_remove)
|
||||||
|
|
||||||
|
req_indices_to_remove_list = list(req_indices_to_remove)
|
||||||
|
req_indices_to_remove_list.sort(reverse=True)
|
||||||
|
req_ids_to_remove: Set[str] = set()
|
||||||
|
for index in req_indices_to_remove:
|
||||||
|
input_batch.remove_request(reqs[index].req_id)
|
||||||
|
req_ids_to_remove.add(reqs[index].req_id)
|
||||||
|
return (req_ids_to_remove, req_indices_to_remove_list)
|
||||||
|
|
||||||
|
|
||||||
|
def _construct_expected_sampling_metadata(
|
||||||
|
reqs: List[CachedRequestState], req_ids_retained: Set[int],
|
||||||
|
req_id_index_in_input_batch: Dict[str, int],
|
||||||
|
device: torch.device) -> SamplingMetadata:
|
||||||
|
"""
|
||||||
|
Constructs and returns the expected SamplingMetadata for this
|
||||||
|
batch.
|
||||||
|
"""
|
||||||
|
num_reqs = len(req_ids_retained)
|
||||||
|
output_token_ids: List[List[int]] = [list() for _ in range(num_reqs)]
|
||||||
|
prompt_token_ids: List[List[int]] = [list() for _ in range(num_reqs)]
|
||||||
|
presence_penalties = [0.0 for _ in range(num_reqs)]
|
||||||
|
frequency_penalties = [0.0 for _ in range(num_reqs)]
|
||||||
|
repetition_penalties = [1.0 for _ in range(num_reqs)]
|
||||||
|
top_k = [0 for _ in range(num_reqs)]
|
||||||
|
top_p = [0.0 for _ in range(num_reqs)]
|
||||||
|
temperature = [0.0 for _ in range(num_reqs)]
|
||||||
|
stop_token_ids: List[Set[int]] = [set() for _ in range(num_reqs)]
|
||||||
|
min_tokens = [0 for _ in range(num_reqs)]
|
||||||
|
for req in reqs:
|
||||||
|
if req.req_id not in req_ids_retained:
|
||||||
|
continue
|
||||||
|
index_in_input_batch = req_id_index_in_input_batch[req.req_id]
|
||||||
|
output_token_ids[index_in_input_batch] = req.output_token_ids
|
||||||
|
prompt_token_ids[index_in_input_batch] = req.prompt_token_ids
|
||||||
|
presence_penalties[
|
||||||
|
index_in_input_batch] = req.sampling_params.presence_penalty
|
||||||
|
frequency_penalties[
|
||||||
|
index_in_input_batch] = req.sampling_params.frequency_penalty
|
||||||
|
repetition_penalties[
|
||||||
|
index_in_input_batch] = req.sampling_params.repetition_penalty
|
||||||
|
top_k[index_in_input_batch] = req.sampling_params.top_k
|
||||||
|
top_p[index_in_input_batch] = req.sampling_params.top_p
|
||||||
|
temperature[index_in_input_batch] = req.sampling_params.temperature
|
||||||
|
stop_token_ids[
|
||||||
|
index_in_input_batch] = req.sampling_params.all_stop_token_ids
|
||||||
|
min_tokens[index_in_input_batch] = req.sampling_params.min_tokens
|
||||||
|
|
||||||
|
|
||||||
|
return SamplingMetadata(
|
||||||
|
temperature=torch.tensor(temperature, dtype=torch.float, device=device),
|
||||||
|
all_greedy=False,
|
||||||
|
all_random=True,
|
||||||
|
top_p=torch.tensor(top_p, dtype=torch.float, device=device),
|
||||||
|
top_k=torch.tensor(top_k, dtype=torch.int, device=device),
|
||||||
|
no_top_p=all(x == 1.0 for x in top_p),
|
||||||
|
no_top_k=all(x == 0 for x in top_k),
|
||||||
|
generators={},
|
||||||
|
max_num_logprobs=0,
|
||||||
|
prompt_token_ids= make_tensor_with_pad(
|
||||||
|
prompt_token_ids,
|
||||||
|
pad=VOCAB_SIZE,
|
||||||
|
device=torch.device(device),
|
||||||
|
dtype=torch.int64,
|
||||||
|
),
|
||||||
|
frequency_penalties=torch.tensor(
|
||||||
|
frequency_penalties, dtype=torch.float,
|
||||||
|
device=device),
|
||||||
|
presence_penalties=torch.tensor(
|
||||||
|
presence_penalties, dtype=torch.float,
|
||||||
|
device=device),
|
||||||
|
repetition_penalties=torch.tensor(
|
||||||
|
repetition_penalties, dtype=torch.float,
|
||||||
|
device=device),
|
||||||
|
output_token_ids=output_token_ids,
|
||||||
|
min_tokens=min_tokens,
|
||||||
|
stop_token_ids=stop_token_ids,
|
||||||
|
no_penalties=(all(x ==0 for x in presence_penalties) and \
|
||||||
|
all(x ==0 for x in frequency_penalties) and \
|
||||||
|
all(x ==1 for x in repetition_penalties))
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _create_sampling_params():
|
||||||
|
return SamplingParams(top_k=np.random.randint(1, 10),
|
||||||
|
top_p=np.random.uniform(0.0, 1.0),
|
||||||
|
presence_penalty=np.random.uniform(-2.0, 2.0),
|
||||||
|
repetition_penalty=np.random.uniform(0.0, 2.0),
|
||||||
|
frequency_penalty=np.random.uniform(-2.0, 2.0),
|
||||||
|
min_tokens=np.random.randint(1, 10),
|
||||||
|
stop_token_ids=[
|
||||||
|
np.random.randint(0, VOCAB_SIZE)
|
||||||
|
for _ in range(np.random.randint(10))
|
||||||
|
])
|
||||||
|
|
||||||
|
|
||||||
|
def _construct_cached_request_state(req_id_suffix: int):
|
||||||
|
prompt_token_ids = [
|
||||||
|
np.random.randint(0, VOCAB_SIZE)
|
||||||
|
for _ in range(np.random.randint(0, MAX_PROMPT_SIZE))
|
||||||
|
]
|
||||||
|
output_token_ids = [
|
||||||
|
np.random.randint(0, VOCAB_SIZE)
|
||||||
|
for _ in range(np.random.randint(0, NUM_OUTPUT_TOKENS))
|
||||||
|
]
|
||||||
|
return CachedRequestState(req_id=f"req_id_{req_id_suffix}",
|
||||||
|
prompt_token_ids=prompt_token_ids,
|
||||||
|
prompt=None,
|
||||||
|
sampling_params=_create_sampling_params(),
|
||||||
|
mm_inputs=[],
|
||||||
|
mm_positions=[],
|
||||||
|
block_ids=[],
|
||||||
|
generator=None,
|
||||||
|
num_computed_tokens=len(output_token_ids),
|
||||||
|
output_token_ids=output_token_ids)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("device", CUDA_DEVICES)
|
||||||
|
@pytest.mark.parametrize("batch_size", [1, 2, 32, 64])
|
||||||
|
def test_sampling_metadata_in_input_batch(device: str, batch_size: int):
|
||||||
|
"""
|
||||||
|
Tests the logic for managing sampling metadata in the InputBatch.
|
||||||
|
|
||||||
|
This test involves adding a set of requests to the InputBatch,
|
||||||
|
followed by removing a subset of them. Afterward, the batch is compacted,
|
||||||
|
and the `make_sampling_metadata` method is invoked on the batch. The
|
||||||
|
output of `make_sampling_metadata` is then compared against the expected
|
||||||
|
results to ensure correctness.
|
||||||
|
"""
|
||||||
|
input_batch: InputBatch = InputBatch(max_num_reqs=batch_size,
|
||||||
|
max_model_len=1024,
|
||||||
|
max_num_blocks_per_req=10,
|
||||||
|
device=torch.device(device),
|
||||||
|
pin_memory=is_pin_memory_available(),
|
||||||
|
vocab_size=1024)
|
||||||
|
reqs: List[CachedRequestState] = []
|
||||||
|
req_id_reqs = {}
|
||||||
|
req_id_output_token_ids = {}
|
||||||
|
# Add requests
|
||||||
|
for req_index in range(batch_size):
|
||||||
|
req: CachedRequestState = _construct_cached_request_state(req_index)
|
||||||
|
input_batch.add_request(req, req_index)
|
||||||
|
reqs.append(req)
|
||||||
|
req_id_reqs[req.req_id] = req
|
||||||
|
req_id_output_token_ids[req.req_id] = req.output_token_ids
|
||||||
|
|
||||||
|
# Remove some requests
|
||||||
|
req_ids_to_remove, req_indices_to_remove = _remove_requests(
|
||||||
|
input_batch, batch_size, reqs)
|
||||||
|
req_ids_retained = set(req_id_reqs.keys()) - req_ids_to_remove
|
||||||
|
|
||||||
|
# Compact the input batch
|
||||||
|
input_batch.condense(req_indices_to_remove)
|
||||||
|
|
||||||
|
# Generate the sampling metadata
|
||||||
|
sampling_metadata = input_batch.make_sampling_metadata(
|
||||||
|
req_id_output_token_ids, skip_copy=False)
|
||||||
|
|
||||||
|
# Create expected output.
|
||||||
|
expected_sampling_metadata = _construct_expected_sampling_metadata(
|
||||||
|
reqs,
|
||||||
|
req_ids_retained,
|
||||||
|
input_batch.req_id_to_index,
|
||||||
|
device=torch.device(device))
|
||||||
|
|
||||||
|
# Assert the actual and expected output.
|
||||||
|
assert torch.allclose(expected_sampling_metadata.temperature,
|
||||||
|
sampling_metadata.temperature)
|
||||||
|
assert torch.allclose(expected_sampling_metadata.top_p,
|
||||||
|
sampling_metadata.top_p)
|
||||||
|
assert torch.allclose(expected_sampling_metadata.top_k,
|
||||||
|
sampling_metadata.top_k)
|
||||||
|
assert torch.allclose(expected_sampling_metadata.frequency_penalties,
|
||||||
|
sampling_metadata.frequency_penalties)
|
||||||
|
assert torch.allclose(expected_sampling_metadata.presence_penalties,
|
||||||
|
sampling_metadata.presence_penalties)
|
||||||
|
assert torch.allclose(expected_sampling_metadata.repetition_penalties,
|
||||||
|
sampling_metadata.repetition_penalties)
|
||||||
|
assert torch.allclose(expected_sampling_metadata.prompt_token_ids,
|
||||||
|
sampling_metadata.prompt_token_ids)
|
||||||
|
assert (expected_sampling_metadata.output_token_ids ==
|
||||||
|
sampling_metadata.output_token_ids)
|
||||||
|
assert (
|
||||||
|
expected_sampling_metadata.min_tokens == sampling_metadata.min_tokens)
|
||||||
|
assert (expected_sampling_metadata.stop_token_ids ==
|
||||||
|
sampling_metadata.stop_token_ids)
|
||||||
|
assert (expected_sampling_metadata.no_penalties ==
|
||||||
|
sampling_metadata.no_penalties)
|
||||||
|
assert (expected_sampling_metadata.no_top_p == sampling_metadata.no_top_p)
|
||||||
|
assert (expected_sampling_metadata.no_top_k == sampling_metadata.no_top_k)
|
||||||
@ -1,11 +1,17 @@
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Literal, Tuple
|
from typing import Literal
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
import librosa
|
import numpy.typing as npt
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from vllm.assets.base import get_vllm_public_assets, vLLM_S3_BUCKET_URL
|
from vllm.utils import PlaceholderModule
|
||||||
|
|
||||||
|
from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets
|
||||||
|
|
||||||
|
try:
|
||||||
|
import librosa
|
||||||
|
except ImportError:
|
||||||
|
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
|
||||||
|
|
||||||
ASSET_DIR = "multimodal_asset"
|
ASSET_DIR = "multimodal_asset"
|
||||||
|
|
||||||
@ -15,8 +21,7 @@ class AudioAsset:
|
|||||||
name: Literal["winning_call", "mary_had_lamb"]
|
name: Literal["winning_call", "mary_had_lamb"]
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def audio_and_sample_rate(self) -> Tuple[np.ndarray, int]:
|
def audio_and_sample_rate(self) -> tuple[npt.NDArray, int]:
|
||||||
|
|
||||||
audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg",
|
audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg",
|
||||||
s3_prefix=ASSET_DIR)
|
s3_prefix=ASSET_DIR)
|
||||||
y, sr = librosa.load(audio_path, sr=None)
|
y, sr = librosa.load(audio_path, sr=None)
|
||||||
@ -25,4 +30,4 @@ class AudioAsset:
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def url(self) -> str:
|
def url(self) -> str:
|
||||||
return urljoin(vLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg")
|
return urljoin(VLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg")
|
||||||
|
|||||||
@ -4,9 +4,8 @@ from typing import Optional
|
|||||||
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
from vllm.connections import global_http_connection
|
from vllm.connections import global_http_connection
|
||||||
from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT
|
|
||||||
|
|
||||||
vLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com"
|
VLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com"
|
||||||
|
|
||||||
|
|
||||||
def get_cache_dir() -> Path:
|
def get_cache_dir() -> Path:
|
||||||
@ -32,8 +31,8 @@ def get_vllm_public_assets(filename: str,
|
|||||||
if s3_prefix is not None:
|
if s3_prefix is not None:
|
||||||
filename = s3_prefix + "/" + filename
|
filename = s3_prefix + "/" + filename
|
||||||
global_http_connection.download_file(
|
global_http_connection.download_file(
|
||||||
f"{vLLM_S3_BUCKET_URL}/{filename}",
|
f"{VLLM_S3_BUCKET_URL}/{filename}",
|
||||||
asset_path,
|
asset_path,
|
||||||
timeout=VLLM_IMAGE_FETCH_TIMEOUT)
|
timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT)
|
||||||
|
|
||||||
return asset_path
|
return asset_path
|
||||||
|
|||||||
@ -4,7 +4,7 @@ from typing import Literal
|
|||||||
import torch
|
import torch
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from vllm.assets.base import get_vllm_public_assets
|
from .base import get_vllm_public_assets
|
||||||
|
|
||||||
VLM_IMAGES_DIR = "vision_model_images"
|
VLM_IMAGES_DIR = "vision_model_images"
|
||||||
|
|
||||||
@ -15,7 +15,6 @@ class ImageAsset:
|
|||||||
|
|
||||||
@property
|
@property
|
||||||
def pil_image(self) -> Image.Image:
|
def pil_image(self) -> Image.Image:
|
||||||
|
|
||||||
image_path = get_vllm_public_assets(filename=f"{self.name}.jpg",
|
image_path = get_vllm_public_assets(filename=f"{self.name}.jpg",
|
||||||
s3_prefix=VLM_IMAGES_DIR)
|
s3_prefix=VLM_IMAGES_DIR)
|
||||||
return Image.open(image_path)
|
return Image.open(image_path)
|
||||||
|
|||||||
@ -2,13 +2,13 @@ from dataclasses import dataclass
|
|||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from typing import List, Literal
|
from typing import List, Literal
|
||||||
|
|
||||||
|
import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import numpy.typing as npt
|
import numpy.typing as npt
|
||||||
from huggingface_hub import hf_hub_download
|
from huggingface_hub import hf_hub_download
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from vllm.multimodal.utils import (sample_frames_from_video,
|
from vllm.multimodal.video import sample_frames_from_video
|
||||||
try_import_video_packages)
|
|
||||||
|
|
||||||
from .base import get_cache_dir
|
from .base import get_cache_dir
|
||||||
|
|
||||||
@ -19,7 +19,7 @@ def download_video_asset(filename: str) -> str:
|
|||||||
Download and open an image from huggingface
|
Download and open an image from huggingface
|
||||||
repo: raushan-testing-hf/videos-test
|
repo: raushan-testing-hf/videos-test
|
||||||
"""
|
"""
|
||||||
video_directory = get_cache_dir() / "video-eample-data"
|
video_directory = get_cache_dir() / "video-example-data"
|
||||||
video_directory.mkdir(parents=True, exist_ok=True)
|
video_directory.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
video_path = video_directory / filename
|
video_path = video_directory / filename
|
||||||
@ -35,8 +35,6 @@ def download_video_asset(filename: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
|
def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
|
||||||
cv2, _ = try_import_video_packages()
|
|
||||||
|
|
||||||
cap = cv2.VideoCapture(path)
|
cap = cv2.VideoCapture(path)
|
||||||
if not cap.isOpened():
|
if not cap.isOpened():
|
||||||
raise ValueError(f"Could not open video file {path}")
|
raise ValueError(f"Could not open video file {path}")
|
||||||
@ -59,7 +57,6 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
|
|||||||
|
|
||||||
def video_to_pil_images_list(path: str,
|
def video_to_pil_images_list(path: str,
|
||||||
num_frames: int = -1) -> List[Image.Image]:
|
num_frames: int = -1) -> List[Image.Image]:
|
||||||
cv2, _ = try_import_video_packages()
|
|
||||||
frames = video_to_ndarrays(path, num_frames)
|
frames = video_to_ndarrays(path, num_frames)
|
||||||
return [
|
return [
|
||||||
Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
|
Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
|
||||||
|
|||||||
@ -141,14 +141,14 @@ class AlwaysHitShapeEnv:
|
|||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
def wrap_inductor(graph,
|
def wrap_inductor(graph: fx.GraphModule,
|
||||||
example_inputs,
|
example_inputs,
|
||||||
additional_inductor_config,
|
additional_inductor_config,
|
||||||
compilation_config: CompilationConfig,
|
compilation_config: CompilationConfig,
|
||||||
graph_index: int = 0,
|
graph_index: int = 0,
|
||||||
num_graphs: int = 1,
|
num_graphs: int = 1,
|
||||||
runtime_shape: Optional[int] = None,
|
runtime_shape: Optional[int] = None,
|
||||||
use_inductor: bool = True):
|
use_inductor: bool = True) -> Any:
|
||||||
if graph_index == 0:
|
if graph_index == 0:
|
||||||
# before compiling the first graph, record the start time
|
# before compiling the first graph, record the start time
|
||||||
global compilation_start_time
|
global compilation_start_time
|
||||||
@ -209,7 +209,7 @@ def wrap_inductor(graph,
|
|||||||
returns_tuple = graph_returns_tuple(graph)
|
returns_tuple = graph_returns_tuple(graph)
|
||||||
|
|
||||||
# this is the graph we return to Dynamo to run
|
# this is the graph we return to Dynamo to run
|
||||||
def compiled_graph(*args):
|
def compiled_graph(*args) -> Optional[fx.CompiledFxGraph]:
|
||||||
# convert args to list
|
# convert args to list
|
||||||
list_args = list(args)
|
list_args = list(args)
|
||||||
graph_output = inductor_compiled_graph(list_args)
|
graph_output = inductor_compiled_graph(list_args)
|
||||||
@ -247,7 +247,7 @@ def wrap_inductor(graph,
|
|||||||
# see https://github.com/pytorch/pytorch/blob/9f5ebf3fc609105a74eab4ccc24932d6353ff566/torch/_inductor/codecache.py#L1221 # noqa
|
# see https://github.com/pytorch/pytorch/blob/9f5ebf3fc609105a74eab4ccc24932d6353ff566/torch/_inductor/codecache.py#L1221 # noqa
|
||||||
return
|
return
|
||||||
|
|
||||||
def _get_shape_env():
|
def _get_shape_env() -> AlwaysHitShapeEnv:
|
||||||
return AlwaysHitShapeEnv()
|
return AlwaysHitShapeEnv()
|
||||||
|
|
||||||
with patch(# for hijacking the hash of the compiled graph
|
with patch(# for hijacking the hash of the compiled graph
|
||||||
@ -537,7 +537,7 @@ class VllmBackend:
|
|||||||
example_inputs[x].clone() for x in self.sym_tensor_indices
|
example_inputs[x].clone() for x in self.sym_tensor_indices
|
||||||
]
|
]
|
||||||
|
|
||||||
def copy_and_call(*args):
|
def copy_and_call(*args) -> fx.GraphModule:
|
||||||
list_args = list(args)
|
list_args = list(args)
|
||||||
for i, index in enumerate(self.sym_tensor_indices):
|
for i, index in enumerate(self.sym_tensor_indices):
|
||||||
runtime_tensor = list_args[index]
|
runtime_tensor = list_args[index]
|
||||||
|
|||||||
@ -7,6 +7,7 @@ from torch import fx
|
|||||||
from torch._higher_order_ops.auto_functionalize import auto_functionalized
|
from torch._higher_order_ops.auto_functionalize import auto_functionalized
|
||||||
from torch._inductor import pattern_matcher as pm
|
from torch._inductor import pattern_matcher as pm
|
||||||
from torch._ops import OpOverload
|
from torch._ops import OpOverload
|
||||||
|
from torch.fx import Node
|
||||||
|
|
||||||
from vllm.compilation.fx_utils import find_auto_fn
|
from vllm.compilation.fx_utils import find_auto_fn
|
||||||
|
|
||||||
@ -97,7 +98,7 @@ class MultiOutputMatch(abc.ABC):
|
|||||||
self.graph.call_function(operator.getitem, (tuple_node, idx))
|
self.graph.call_function(operator.getitem, (tuple_node, idx))
|
||||||
for idx in indices)
|
for idx in indices)
|
||||||
|
|
||||||
def insert_auto_fn(self, op: OpOverload, kwargs):
|
def insert_auto_fn(self, op: OpOverload, kwargs) -> Node:
|
||||||
"""
|
"""
|
||||||
Insert an auto_functionalized node with the given op and kwargs.
|
Insert an auto_functionalized node with the given op and kwargs.
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
from typing import List
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
from torch import fx as fx
|
from torch import fx as fx
|
||||||
|
|
||||||
@ -53,7 +53,7 @@ class PostGradPassManager:
|
|||||||
assert isinstance(pass_, InductorPass)
|
assert isinstance(pass_, InductorPass)
|
||||||
self.passes.append(pass_)
|
self.passes.append(pass_)
|
||||||
|
|
||||||
def __getstate__(self):
|
def __getstate__(self) -> Dict[str, List[Any]]:
|
||||||
"""
|
"""
|
||||||
Custom pickling for the pass manager, as some passes cannot be pickled.
|
Custom pickling for the pass manager, as some passes cannot be pickled.
|
||||||
Pickling occurs because the pass manager is set as the value of
|
Pickling occurs because the pass manager is set as the value of
|
||||||
|
|||||||
@ -29,6 +29,7 @@ from vllm.transformers_utils.config import (
|
|||||||
get_hf_text_config, get_pooling_config,
|
get_hf_text_config, get_pooling_config,
|
||||||
get_sentence_transformer_tokenizer_config, is_encoder_decoder,
|
get_sentence_transformer_tokenizer_config, is_encoder_decoder,
|
||||||
try_get_generation_config, uses_mrope)
|
try_get_generation_config, uses_mrope)
|
||||||
|
from vllm.transformers_utils.s3_utils import S3Model
|
||||||
from vllm.transformers_utils.utils import is_s3
|
from vllm.transformers_utils.utils import is_s3
|
||||||
from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
|
from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
|
||||||
get_cpu_memory, print_warning_once, random_uuid,
|
get_cpu_memory, print_warning_once, random_uuid,
|
||||||
@ -372,15 +373,6 @@ class ModelConfig:
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
if is_s3(model) or is_s3(tokenizer):
|
if is_s3(model) or is_s3(tokenizer):
|
||||||
try:
|
|
||||||
from vllm.transformers_utils.s3_utils import S3Model
|
|
||||||
except ImportError as err:
|
|
||||||
raise ImportError(
|
|
||||||
"Please install Run:ai optional dependency "
|
|
||||||
"to use the S3 capabilities. "
|
|
||||||
"You can install it with: pip install vllm[runai]"
|
|
||||||
) from err
|
|
||||||
|
|
||||||
if is_s3(model):
|
if is_s3(model):
|
||||||
self.s3_model = S3Model()
|
self.s3_model = S3Model()
|
||||||
self.s3_model.pull_files(model, allow_pattern=["*config.json"])
|
self.s3_model.pull_files(model, allow_pattern=["*config.json"])
|
||||||
|
|||||||
@ -231,7 +231,8 @@ class LoRAModel(AdapterModel):
|
|||||||
with safetensors.safe_open(lora_tensor_path,
|
with safetensors.safe_open(lora_tensor_path,
|
||||||
framework="pt") as f: # type: ignore
|
framework="pt") as f: # type: ignore
|
||||||
for lora_module in f.keys(): # noqa
|
for lora_module in f.keys(): # noqa
|
||||||
module_name, _, _ = parse_fine_tuned_lora_name(lora_module)
|
module_name, _, _ = parse_fine_tuned_lora_name(
|
||||||
|
lora_module, weights_mapper)
|
||||||
part_name = module_name.split(".")[-1]
|
part_name = module_name.split(".")[-1]
|
||||||
if part_name not in expected_lora_modules:
|
if part_name not in expected_lora_modules:
|
||||||
unexpected_modules.append(module_name)
|
unexpected_modules.append(module_name)
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
import copy
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from typing import List, Optional, Set, Tuple, Type, Union
|
from typing import List, Optional, Set, Tuple, Type, Union
|
||||||
@ -32,7 +31,6 @@ from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
|
|||||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||||
from vllm.model_executor.models.utils import WeightsMapper
|
from vllm.model_executor.models.utils import WeightsMapper
|
||||||
from vllm.utils import print_warning_once
|
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
@ -112,36 +110,28 @@ def parse_fine_tuned_lora_name(
|
|||||||
is_bias whether the tensor is lora bias.
|
is_bias whether the tensor is lora bias.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
w_mapper = None
|
# LoRA weight qualified name always starts with `base_model.model.`,
|
||||||
if weights_mapper:
|
# so we remove the prefix `base_model.model.` to make the following
|
||||||
w_mapper = copy.deepcopy(weights_mapper)
|
# mapping correctly.
|
||||||
# TODO: Currently only supports mapping for prefix, mapping for
|
if "base_model.model." in name:
|
||||||
# substr and subfix will be supported in the future.
|
name = name.replace("base_model.model.", "")
|
||||||
for attr, mapping in [
|
name = weights_mapper._map_name(name) if weights_mapper else name
|
||||||
("orig_to_new_substr", w_mapper.orig_to_new_substr),
|
# recover the prefix `base_model.model.`
|
||||||
("orig_to_new_suffix", w_mapper.orig_to_new_suffix),
|
name = "base_model.model." + name
|
||||||
]:
|
|
||||||
if mapping:
|
|
||||||
print_warning_once(
|
|
||||||
f"vLLM currently does not support mapping of LoRA weights "
|
|
||||||
f"for {mapping}.")
|
|
||||||
setattr(w_mapper, attr, {})
|
|
||||||
|
|
||||||
mapper = (lambda name: w_mapper._map_name(name)
|
|
||||||
if w_mapper is not None else name)
|
|
||||||
parts = name.split(".")
|
parts = name.split(".")
|
||||||
if parts[-1] == "weight" and (parts[-2] == "lora_A"
|
if parts[-1] == "weight" and (parts[-2] == "lora_A"
|
||||||
or parts[-2] == "lora_B"):
|
or parts[-2] == "lora_B"):
|
||||||
new_name = ".".join(parts[2:-2])
|
new_name = ".".join(parts[2:-2])
|
||||||
return mapper(new_name), parts[-2] == "lora_A", False
|
return new_name, parts[-2] == "lora_A", False
|
||||||
|
|
||||||
if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B":
|
if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B":
|
||||||
new_name = ".".join(parts[2:-1])
|
new_name = ".".join(parts[2:-1])
|
||||||
return mapper(new_name), parts[-1] == "lora_embedding_A", False
|
return new_name, parts[-1] == "lora_embedding_A", False
|
||||||
|
|
||||||
if parts[-1] == "bias":
|
if parts[-1] == "bias":
|
||||||
new_name = ".".join(parts[2:-2])
|
new_name = ".".join(parts[2:-2])
|
||||||
return mapper(new_name), False, True
|
return new_name, False, True
|
||||||
|
|
||||||
raise ValueError(f"{name} is unsupported LoRA weight")
|
raise ValueError(f"{name} is unsupported LoRA weight")
|
||||||
|
|
||||||
|
|||||||
@ -91,6 +91,8 @@ class WorkerLoRAManager(AbstractWorkerManager):
|
|||||||
packed_modules_mapping[module])
|
packed_modules_mapping[module])
|
||||||
else:
|
else:
|
||||||
expected_lora_modules.append(module)
|
expected_lora_modules.append(module)
|
||||||
|
|
||||||
|
expected_lora_modules = list(set(expected_lora_modules))
|
||||||
lora_path = get_adapter_absolute_path(lora_request.lora_path)
|
lora_path = get_adapter_absolute_path(lora_request.lora_path)
|
||||||
|
|
||||||
# For some models like Qwen2VL, we need to use hf_to_vllm_mapper
|
# For some models like Qwen2VL, we need to use hf_to_vllm_mapper
|
||||||
|
|||||||
@ -11,6 +11,7 @@ import torch
|
|||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
|
from vllm.model_executor.layers.utils import apply_penalties
|
||||||
from vllm.model_executor.sampling_metadata import (SamplingMetadata,
|
from vllm.model_executor.sampling_metadata import (SamplingMetadata,
|
||||||
SamplingTensors,
|
SamplingTensors,
|
||||||
SequenceGroupToSample)
|
SequenceGroupToSample)
|
||||||
@ -258,11 +259,11 @@ class Sampler(nn.Module):
|
|||||||
|
|
||||||
# Apply presence and frequency penalties.
|
# Apply presence and frequency penalties.
|
||||||
if do_penalties:
|
if do_penalties:
|
||||||
logits = _apply_penalties(logits, sampling_tensors.prompt_tokens,
|
logits = apply_penalties(logits, sampling_tensors.prompt_tokens,
|
||||||
sampling_tensors.output_tokens,
|
sampling_tensors.output_tokens,
|
||||||
sampling_tensors.presence_penalties,
|
sampling_tensors.presence_penalties,
|
||||||
sampling_tensors.frequency_penalties,
|
sampling_tensors.frequency_penalties,
|
||||||
sampling_tensors.repetition_penalties)
|
sampling_tensors.repetition_penalties)
|
||||||
|
|
||||||
# Use float32 to apply temperature scaling.
|
# Use float32 to apply temperature scaling.
|
||||||
# Use in-place division to avoid creating a new tensor.
|
# Use in-place division to avoid creating a new tensor.
|
||||||
@ -336,23 +337,6 @@ class Sampler(nn.Module):
|
|||||||
return self.should_modify_greedy_probs_inplace
|
return self.should_modify_greedy_probs_inplace
|
||||||
|
|
||||||
|
|
||||||
def _get_bin_counts_and_mask(
|
|
||||||
tokens: torch.Tensor,
|
|
||||||
vocab_size: int,
|
|
||||||
num_seqs: int,
|
|
||||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
|
||||||
# Compute the bin counts for the tokens.
|
|
||||||
# vocab_size + 1 for padding.
|
|
||||||
bin_counts = torch.zeros((num_seqs, vocab_size + 1),
|
|
||||||
dtype=torch.long,
|
|
||||||
device=tokens.device)
|
|
||||||
bin_counts.scatter_add_(1, tokens, torch.ones_like(tokens))
|
|
||||||
bin_counts = bin_counts[:, :vocab_size]
|
|
||||||
mask = bin_counts > 0
|
|
||||||
|
|
||||||
return bin_counts, mask
|
|
||||||
|
|
||||||
|
|
||||||
def _apply_min_tokens_penalty(
|
def _apply_min_tokens_penalty(
|
||||||
logits: torch.Tensor,
|
logits: torch.Tensor,
|
||||||
sampling_metadata: SamplingMetadata,
|
sampling_metadata: SamplingMetadata,
|
||||||
@ -400,29 +384,6 @@ def _apply_min_tokens_penalty(
|
|||||||
return logits
|
return logits
|
||||||
|
|
||||||
|
|
||||||
def _apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor,
|
|
||||||
output_tokens_tensor: torch.Tensor,
|
|
||||||
presence_penalties: torch.Tensor,
|
|
||||||
frequency_penalties: torch.Tensor,
|
|
||||||
repetition_penalties: torch.Tensor) -> torch.Tensor:
|
|
||||||
num_seqs, vocab_size = logits.shape
|
|
||||||
_, prompt_mask = _get_bin_counts_and_mask(prompt_tokens_tensor, vocab_size,
|
|
||||||
num_seqs)
|
|
||||||
output_bin_counts, output_mask = _get_bin_counts_and_mask(
|
|
||||||
output_tokens_tensor, vocab_size, num_seqs)
|
|
||||||
|
|
||||||
repetition_penalties = repetition_penalties[:, None].repeat(1, vocab_size)
|
|
||||||
repetition_penalties[~(prompt_mask | output_mask)] = 1.0
|
|
||||||
logits = torch.where(logits > 0, logits / repetition_penalties,
|
|
||||||
logits * repetition_penalties)
|
|
||||||
|
|
||||||
# We follow the definition in OpenAI API.
|
|
||||||
# Refer to https://platform.openai.com/docs/api-reference/parameter-details
|
|
||||||
logits -= frequency_penalties.unsqueeze_(dim=1) * output_bin_counts
|
|
||||||
logits -= presence_penalties.unsqueeze_(dim=1) * output_mask
|
|
||||||
return logits
|
|
||||||
|
|
||||||
|
|
||||||
def _apply_top_k_top_p(
|
def _apply_top_k_top_p(
|
||||||
logits: torch.Tensor,
|
logits: torch.Tensor,
|
||||||
p: torch.Tensor,
|
p: torch.Tensor,
|
||||||
|
|||||||
57
vllm/model_executor/layers/utils.py
Normal file
57
vllm/model_executor/layers/utils.py
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
"""Utility methods for model layers."""
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
def get_token_bin_counts_and_mask(
|
||||||
|
tokens: torch.Tensor,
|
||||||
|
vocab_size: int,
|
||||||
|
num_seqs: int,
|
||||||
|
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
# Compute the bin counts for the tokens.
|
||||||
|
# vocab_size + 1 for padding.
|
||||||
|
bin_counts = torch.zeros((num_seqs, vocab_size + 1),
|
||||||
|
dtype=torch.long,
|
||||||
|
device=tokens.device)
|
||||||
|
bin_counts.scatter_add_(1, tokens, torch.ones_like(tokens))
|
||||||
|
bin_counts = bin_counts[:, :vocab_size]
|
||||||
|
mask = bin_counts > 0
|
||||||
|
|
||||||
|
return bin_counts, mask
|
||||||
|
|
||||||
|
|
||||||
|
def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor,
|
||||||
|
output_tokens_tensor: torch.Tensor,
|
||||||
|
presence_penalties: torch.Tensor,
|
||||||
|
frequency_penalties: torch.Tensor,
|
||||||
|
repetition_penalties: torch.Tensor) -> torch.Tensor:
|
||||||
|
"""
|
||||||
|
Applies penalties in place to the logits tensor
|
||||||
|
logits : The input logits tensor of shape [num_seqs, vocab_size]
|
||||||
|
prompt_tokens_tensor: A tensor containing the prompt tokens. The prompts
|
||||||
|
are padded to the maximum prompt length within the batch using
|
||||||
|
`vocab_size` as the padding value. The value `vocab_size` is used
|
||||||
|
for padding because it does not correspond to any valid token ID
|
||||||
|
in the vocabulary.
|
||||||
|
output_tokens_tensor: The output tokens tensor.
|
||||||
|
presence_penalties: The presence penalties of shape (num_seqs, )
|
||||||
|
frequency_penalties: The frequency penalties of shape (num_seqs, )
|
||||||
|
repetition_penalties: The repetition penalties of shape (num_seqs, )
|
||||||
|
"""
|
||||||
|
num_seqs, vocab_size = logits.shape
|
||||||
|
_, prompt_mask = get_token_bin_counts_and_mask(prompt_tokens_tensor,
|
||||||
|
vocab_size, num_seqs)
|
||||||
|
output_bin_counts, output_mask = get_token_bin_counts_and_mask(
|
||||||
|
output_tokens_tensor, vocab_size, num_seqs)
|
||||||
|
repetition_penalties = repetition_penalties.unsqueeze_(dim=1).repeat(
|
||||||
|
1, vocab_size)
|
||||||
|
logits[logits > 0] /= torch.where(prompt_mask | output_mask,
|
||||||
|
repetition_penalties, 1.0)[logits > 0]
|
||||||
|
logits[logits <= 0] *= torch.where(prompt_mask | output_mask,
|
||||||
|
repetition_penalties, 1.0)[logits <= 0]
|
||||||
|
# We follow the definition in OpenAI API.
|
||||||
|
# Refer to https://platform.openai.com/docs/api-reference/parameter-details
|
||||||
|
logits -= frequency_penalties.unsqueeze_(dim=1) * output_bin_counts
|
||||||
|
logits -= presence_penalties.unsqueeze_(dim=1) * output_mask
|
||||||
|
return logits
|
||||||
@ -48,6 +48,7 @@ from vllm.model_executor.model_loader.weight_utils import (
|
|||||||
runai_safetensors_weights_iterator, safetensors_weights_iterator)
|
runai_safetensors_weights_iterator, safetensors_weights_iterator)
|
||||||
from vllm.model_executor.utils import set_weight_attrs
|
from vllm.model_executor.utils import set_weight_attrs
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
from vllm.transformers_utils.s3_utils import glob as s3_glob
|
||||||
from vllm.transformers_utils.utils import is_s3
|
from vllm.transformers_utils.utils import is_s3
|
||||||
from vllm.utils import is_pin_memory_available
|
from vllm.utils import is_pin_memory_available
|
||||||
|
|
||||||
@ -1269,16 +1270,6 @@ class RunaiModelStreamerLoader(BaseModelLoader):
|
|||||||
|
|
||||||
If the model is not local, it will be downloaded."""
|
If the model is not local, it will be downloaded."""
|
||||||
is_s3_path = is_s3(model_name_or_path)
|
is_s3_path = is_s3(model_name_or_path)
|
||||||
if is_s3_path:
|
|
||||||
try:
|
|
||||||
from vllm.transformers_utils.s3_utils import glob as s3_glob
|
|
||||||
except ImportError as err:
|
|
||||||
raise ImportError(
|
|
||||||
"Please install Run:ai optional dependency "
|
|
||||||
"to use the S3 capabilities. "
|
|
||||||
"You can install it with: pip install vllm[runai]"
|
|
||||||
) from err
|
|
||||||
|
|
||||||
is_local = os.path.isdir(model_name_or_path)
|
is_local = os.path.isdir(model_name_or_path)
|
||||||
safetensors_pattern = "*.safetensors"
|
safetensors_pattern = "*.safetensors"
|
||||||
index_file = SAFE_WEIGHTS_INDEX_NAME
|
index_file = SAFE_WEIGHTS_INDEX_NAME
|
||||||
|
|||||||
@ -19,9 +19,7 @@ from vllm.engine.llm_engine import LLMEngine
|
|||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||||
VocabParallelEmbedding)
|
VocabParallelEmbedding)
|
||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser, PlaceholderModule
|
||||||
|
|
||||||
tensorizer_error_msg = None
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from tensorizer import (DecryptionParams, EncryptionParams,
|
from tensorizer import (DecryptionParams, EncryptionParams,
|
||||||
@ -34,8 +32,19 @@ try:
|
|||||||
open_stream,
|
open_stream,
|
||||||
mode=mode,
|
mode=mode,
|
||||||
) for mode in ("rb", "wb+"))
|
) for mode in ("rb", "wb+"))
|
||||||
except ImportError as e:
|
except ImportError:
|
||||||
tensorizer_error_msg = str(e)
|
tensorizer = PlaceholderModule("tensorizer")
|
||||||
|
DecryptionParams = tensorizer.placeholder_attr("DecryptionParams")
|
||||||
|
EncryptionParams = tensorizer.placeholder_attr("EncryptionParams")
|
||||||
|
TensorDeserializer = tensorizer.placeholder_attr("TensorDeserializer")
|
||||||
|
TensorSerializer = tensorizer.placeholder_attr("TensorSerializer")
|
||||||
|
open_stream = tensorizer.placeholder_attr("stream_io.open_stream")
|
||||||
|
convert_bytes = tensorizer.placeholder_attr("utils.convert_bytes")
|
||||||
|
get_mem_usage = tensorizer.placeholder_attr("utils.get_mem_usage")
|
||||||
|
no_init_or_tensor = tensorizer.placeholder_attr("utils.no_init_or_tensor")
|
||||||
|
|
||||||
|
_read_stream = tensorizer.placeholder_attr("_read_stream")
|
||||||
|
_write_stream = tensorizer.placeholder_attr("_write_stream")
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'EncryptionParams', 'DecryptionParams', 'TensorDeserializer',
|
'EncryptionParams', 'DecryptionParams', 'TensorDeserializer',
|
||||||
@ -267,12 +276,6 @@ class TensorizerAgent:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, tensorizer_config: TensorizerConfig, vllm_config):
|
def __init__(self, tensorizer_config: TensorizerConfig, vllm_config):
|
||||||
if tensorizer_error_msg is not None:
|
|
||||||
raise ImportError(
|
|
||||||
"Tensorizer is not installed. Please install tensorizer "
|
|
||||||
"to use this feature with `pip install vllm[tensorizer]`. "
|
|
||||||
"Error message: {}".format(tensorizer_error_msg))
|
|
||||||
|
|
||||||
self.tensorizer_config = tensorizer_config
|
self.tensorizer_config = tensorizer_config
|
||||||
self.tensorizer_args = (
|
self.tensorizer_args = (
|
||||||
self.tensorizer_config._construct_tensorizer_args())
|
self.tensorizer_config._construct_tensorizer_args())
|
||||||
|
|||||||
@ -25,7 +25,15 @@ from vllm.model_executor.layers.quantization import (QuantizationConfig,
|
|||||||
get_quantization_config)
|
get_quantization_config)
|
||||||
from vllm.model_executor.layers.quantization.schema import QuantParamSchema
|
from vllm.model_executor.layers.quantization.schema import QuantParamSchema
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
from vllm.utils import print_warning_once
|
from vllm.utils import PlaceholderModule, print_warning_once
|
||||||
|
|
||||||
|
try:
|
||||||
|
from runai_model_streamer import SafetensorsStreamer
|
||||||
|
except ImportError:
|
||||||
|
runai_model_streamer = PlaceholderModule(
|
||||||
|
"runai_model_streamer") # type: ignore[assignment]
|
||||||
|
SafetensorsStreamer = runai_model_streamer.placeholder_attr(
|
||||||
|
"SafetensorsStreamer")
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
@ -414,13 +422,6 @@ def runai_safetensors_weights_iterator(
|
|||||||
hf_weights_files: List[str]
|
hf_weights_files: List[str]
|
||||||
) -> Generator[Tuple[str, torch.Tensor], None, None]:
|
) -> Generator[Tuple[str, torch.Tensor], None, None]:
|
||||||
"""Iterate over the weights in the model safetensor files."""
|
"""Iterate over the weights in the model safetensor files."""
|
||||||
try:
|
|
||||||
from runai_model_streamer import SafetensorsStreamer
|
|
||||||
except ImportError as err:
|
|
||||||
raise ImportError(
|
|
||||||
"Please install Run:ai optional dependency."
|
|
||||||
"You can install it with: pip install vllm[runai]") from err
|
|
||||||
|
|
||||||
enable_tqdm = not torch.distributed.is_initialized(
|
enable_tqdm = not torch.distributed.is_initialized(
|
||||||
) or torch.distributed.get_rank() == 0
|
) or torch.distributed.get_rank() == 0
|
||||||
with SafetensorsStreamer() as streamer:
|
with SafetensorsStreamer() as streamer:
|
||||||
|
|||||||
@ -464,24 +464,27 @@ class MolmoAttention(nn.Module):
|
|||||||
class MolmoMLP(nn.Module):
|
class MolmoMLP(nn.Module):
|
||||||
"""Molmo's LLM mlp."""
|
"""Molmo's LLM mlp."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(self,
|
||||||
self,
|
config: PretrainedConfig,
|
||||||
config: PretrainedConfig,
|
input_dim: Optional[int] = None,
|
||||||
input_dim: Optional[int] = None,
|
quant_config: Optional[QuantizationConfig] = None,
|
||||||
quant_config: Optional[QuantizationConfig] = None,
|
proj_name: str = "gate_up_proj") -> None:
|
||||||
) -> None:
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.hidden_size = config.hidden_size
|
self.hidden_size = config.hidden_size
|
||||||
self.intermediate_size = config.intermediate_size // 2
|
self.intermediate_size = config.intermediate_size // 2
|
||||||
|
|
||||||
# Feed-forward input projection.
|
# Molmo's LLM proj weights are already merged into the disk, while
|
||||||
self.gate_up_proj = MergedColumnParallelLinear(
|
# image_projector proj is separate. If the same proj_name were used, it
|
||||||
input_dim or self.hidden_size,
|
# would create ambiguity and make it difficult to support BNB and LoRA.
|
||||||
[self.intermediate_size] * 2,
|
self.proj_name = proj_name
|
||||||
bias=False,
|
setattr(
|
||||||
quant_config=quant_config,
|
self, proj_name,
|
||||||
)
|
MergedColumnParallelLinear(
|
||||||
|
input_dim or self.hidden_size,
|
||||||
|
[self.intermediate_size] * 2,
|
||||||
|
bias=False,
|
||||||
|
quant_config=quant_config,
|
||||||
|
))
|
||||||
# Activation function.
|
# Activation function.
|
||||||
self.act_fn = SiluAndMul()
|
self.act_fn = SiluAndMul()
|
||||||
|
|
||||||
@ -497,7 +500,7 @@ class MolmoMLP(nn.Module):
|
|||||||
self,
|
self,
|
||||||
x: torch.Tensor,
|
x: torch.Tensor,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
gate_up, _ = self.gate_up_proj(x)
|
gate_up, _ = getattr(self, self.proj_name)(x)
|
||||||
x = self.act_fn(gate_up)
|
x = self.act_fn(gate_up)
|
||||||
x, _ = self.down_proj(x)
|
x, _ = self.down_proj(x)
|
||||||
return x
|
return x
|
||||||
@ -520,7 +523,9 @@ class MolmoDecoderLayer(nn.Module):
|
|||||||
prefix=f"{prefix}.self_attn")
|
prefix=f"{prefix}.self_attn")
|
||||||
|
|
||||||
# MLP block.
|
# MLP block.
|
||||||
self.mlp = MolmoMLP(config, quant_config=quant_config)
|
self.mlp = MolmoMLP(config,
|
||||||
|
quant_config=quant_config,
|
||||||
|
proj_name="gate_up_proj")
|
||||||
|
|
||||||
# LayerNorm
|
# LayerNorm
|
||||||
assert config.layer_norm_type == "rms"
|
assert config.layer_norm_type == "rms"
|
||||||
@ -616,6 +621,7 @@ class MolmoVisionBackbone(nn.Module):
|
|||||||
config,
|
config,
|
||||||
input_dim=vision_config.image_emb_dim,
|
input_dim=vision_config.image_emb_dim,
|
||||||
quant_config=quant_config,
|
quant_config=quant_config,
|
||||||
|
proj_name="merged_linear",
|
||||||
)
|
)
|
||||||
|
|
||||||
image_dim = vision_config.image_emb_dim * len(self.vit_layers)
|
image_dim = vision_config.image_emb_dim * len(self.vit_layers)
|
||||||
@ -714,8 +720,8 @@ class MolmoVisionBackbone(nn.Module):
|
|||||||
torch.Tensor]]) -> Set[str]:
|
torch.Tensor]]) -> Set[str]:
|
||||||
stacked_params_mapping = [
|
stacked_params_mapping = [
|
||||||
# (param_name, shard_name, shard_id)
|
# (param_name, shard_name, shard_id)
|
||||||
("gate_up_proj", "gate_proj", 0),
|
("merged_linear", "gate_proj", 0),
|
||||||
("gate_up_proj", "up_proj", 1),
|
("merged_linear", "up_proj", 1),
|
||||||
]
|
]
|
||||||
params_dict = dict(self.named_parameters())
|
params_dict = dict(self.named_parameters())
|
||||||
loaded_params: Set[str] = set()
|
loaded_params: Set[str] = set()
|
||||||
|
|||||||
@ -2,10 +2,16 @@ import numpy as np
|
|||||||
import numpy.typing as npt
|
import numpy.typing as npt
|
||||||
|
|
||||||
from vllm.inputs.registry import InputContext
|
from vllm.inputs.registry import InputContext
|
||||||
|
from vllm.utils import PlaceholderModule
|
||||||
|
|
||||||
from .base import MultiModalPlugin
|
from .base import MultiModalPlugin
|
||||||
from .inputs import AudioItem, MultiModalData, MultiModalKwargs
|
from .inputs import AudioItem, MultiModalData, MultiModalKwargs
|
||||||
|
|
||||||
|
try:
|
||||||
|
import librosa
|
||||||
|
except ImportError:
|
||||||
|
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
|
||||||
|
|
||||||
|
|
||||||
class AudioPlugin(MultiModalPlugin):
|
class AudioPlugin(MultiModalPlugin):
|
||||||
"""Plugin for audio data."""
|
"""Plugin for audio data."""
|
||||||
@ -32,10 +38,4 @@ def resample_audio(
|
|||||||
orig_sr: float,
|
orig_sr: float,
|
||||||
target_sr: float,
|
target_sr: float,
|
||||||
) -> npt.NDArray[np.floating]:
|
) -> npt.NDArray[np.floating]:
|
||||||
try:
|
|
||||||
import librosa
|
|
||||||
except ImportError as exc:
|
|
||||||
msg = "Please install vllm[audio] for audio support."
|
|
||||||
raise ImportError(msg) from exc
|
|
||||||
|
|
||||||
return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
|
return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
|
||||||
|
|||||||
@ -84,3 +84,15 @@ class ImagePlugin(MultiModalPlugin):
|
|||||||
|
|
||||||
def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
|
def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
|
||||||
return 3000
|
return 3000
|
||||||
|
|
||||||
|
|
||||||
|
def rescale_image_size(image: Image.Image,
|
||||||
|
size_factor: float,
|
||||||
|
transpose: int = -1) -> Image.Image:
|
||||||
|
"""Rescale the dimensions of an image by a constant factor."""
|
||||||
|
new_width = int(image.width * size_factor)
|
||||||
|
new_height = int(image.height * size_factor)
|
||||||
|
image = image.resize((new_width, new_height))
|
||||||
|
if transpose >= 0:
|
||||||
|
image = image.transpose(Image.Transpose(transpose))
|
||||||
|
return image
|
||||||
|
|||||||
@ -2,7 +2,7 @@ import base64
|
|||||||
import os
|
import os
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from typing import Any, List, Optional, Tuple, TypeVar, Union
|
from typing import List, Optional, Tuple, TypeVar, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import numpy.typing as npt
|
import numpy.typing as npt
|
||||||
@ -13,9 +13,25 @@ import vllm.envs as envs
|
|||||||
from vllm.connections import global_http_connection
|
from vllm.connections import global_http_connection
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
|
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
|
||||||
|
from vllm.utils import PlaceholderModule
|
||||||
|
|
||||||
from .inputs import MultiModalDataDict, PlaceholderRange
|
from .inputs import MultiModalDataDict, PlaceholderRange
|
||||||
|
|
||||||
|
try:
|
||||||
|
import decord
|
||||||
|
except ImportError:
|
||||||
|
decord = PlaceholderModule("decord") # type: ignore[assignment]
|
||||||
|
|
||||||
|
try:
|
||||||
|
import librosa
|
||||||
|
except ImportError:
|
||||||
|
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
|
||||||
|
|
||||||
|
try:
|
||||||
|
import soundfile
|
||||||
|
except ImportError:
|
||||||
|
soundfile = PlaceholderModule("soundfile") # type: ignore[assignment]
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
cached_get_tokenizer = lru_cache(get_tokenizer)
|
cached_get_tokenizer = lru_cache(get_tokenizer)
|
||||||
@ -126,8 +142,6 @@ async def async_fetch_image(image_url: str,
|
|||||||
|
|
||||||
|
|
||||||
def _load_video_from_bytes(b: bytes, num_frames: int = 32) -> npt.NDArray:
|
def _load_video_from_bytes(b: bytes, num_frames: int = 32) -> npt.NDArray:
|
||||||
_, decord = try_import_video_packages()
|
|
||||||
|
|
||||||
video_path = BytesIO(b)
|
video_path = BytesIO(b)
|
||||||
vr = decord.VideoReader(video_path, num_threads=1)
|
vr = decord.VideoReader(video_path, num_threads=1)
|
||||||
total_frame_num = len(vr)
|
total_frame_num = len(vr)
|
||||||
@ -198,22 +212,10 @@ async def async_fetch_video(video_url: str,
|
|||||||
return video
|
return video
|
||||||
|
|
||||||
|
|
||||||
def try_import_audio_packages() -> Tuple[Any, Any]:
|
|
||||||
try:
|
|
||||||
import librosa
|
|
||||||
import soundfile
|
|
||||||
except ImportError as exc:
|
|
||||||
raise ImportError(
|
|
||||||
"Please install vllm[audio] for audio support.") from exc
|
|
||||||
return librosa, soundfile
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
|
def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
|
||||||
"""
|
"""
|
||||||
Load audio from a URL.
|
Load audio from a URL.
|
||||||
"""
|
"""
|
||||||
librosa, _ = try_import_audio_packages()
|
|
||||||
|
|
||||||
if audio_url.startswith("http"):
|
if audio_url.startswith("http"):
|
||||||
audio_bytes = global_http_connection.get_bytes(
|
audio_bytes = global_http_connection.get_bytes(
|
||||||
audio_url,
|
audio_url,
|
||||||
@ -234,8 +236,6 @@ async def async_fetch_audio(
|
|||||||
"""
|
"""
|
||||||
Asynchronously fetch audio from a URL.
|
Asynchronously fetch audio from a URL.
|
||||||
"""
|
"""
|
||||||
librosa, _ = try_import_audio_packages()
|
|
||||||
|
|
||||||
if audio_url.startswith("http"):
|
if audio_url.startswith("http"):
|
||||||
audio_bytes = await global_http_connection.async_get_bytes(
|
audio_bytes = await global_http_connection.async_get_bytes(
|
||||||
audio_url,
|
audio_url,
|
||||||
@ -294,8 +294,6 @@ def encode_audio_base64(
|
|||||||
sampling_rate: int,
|
sampling_rate: int,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Encode audio as base64."""
|
"""Encode audio as base64."""
|
||||||
_, soundfile = try_import_audio_packages()
|
|
||||||
|
|
||||||
buffered = BytesIO()
|
buffered = BytesIO()
|
||||||
soundfile.write(buffered, audio, sampling_rate, format="WAV")
|
soundfile.write(buffered, audio, sampling_rate, format="WAV")
|
||||||
|
|
||||||
@ -324,60 +322,6 @@ def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
|
|||||||
return _load_image_from_bytes(base64.b64decode(image))
|
return _load_image_from_bytes(base64.b64decode(image))
|
||||||
|
|
||||||
|
|
||||||
def rescale_image_size(image: Image.Image,
|
|
||||||
size_factor: float,
|
|
||||||
transpose: int = -1) -> Image.Image:
|
|
||||||
"""Rescale the dimensions of an image by a constant factor."""
|
|
||||||
new_width = int(image.width * size_factor)
|
|
||||||
new_height = int(image.height * size_factor)
|
|
||||||
image = image.resize((new_width, new_height))
|
|
||||||
if transpose >= 0:
|
|
||||||
image = image.transpose(Image.Transpose(transpose))
|
|
||||||
return image
|
|
||||||
|
|
||||||
|
|
||||||
def try_import_video_packages():
|
|
||||||
try:
|
|
||||||
import cv2
|
|
||||||
import decord
|
|
||||||
except ImportError as exc:
|
|
||||||
raise ImportError(
|
|
||||||
"Please install vllm[video] for video support.") from exc
|
|
||||||
return cv2, decord
|
|
||||||
|
|
||||||
|
|
||||||
def resize_video(frames: npt.NDArray, size: Tuple[int, int]) -> npt.NDArray:
|
|
||||||
cv2, _ = try_import_video_packages()
|
|
||||||
|
|
||||||
num_frames, _, _, channels = frames.shape
|
|
||||||
new_height, new_width = size
|
|
||||||
resized_frames = np.empty((num_frames, new_height, new_width, channels),
|
|
||||||
dtype=frames.dtype)
|
|
||||||
for i, frame in enumerate(frames):
|
|
||||||
resized_frame = cv2.resize(frame, (new_width, new_height))
|
|
||||||
resized_frames[i] = resized_frame
|
|
||||||
return resized_frames
|
|
||||||
|
|
||||||
|
|
||||||
def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
|
|
||||||
_, height, width, _ = frames.shape
|
|
||||||
new_height = int(height * size_factor)
|
|
||||||
new_width = int(width * size_factor)
|
|
||||||
|
|
||||||
return resize_video(frames, (new_height, new_width))
|
|
||||||
|
|
||||||
|
|
||||||
def sample_frames_from_video(frames: npt.NDArray,
|
|
||||||
num_frames: int) -> npt.NDArray:
|
|
||||||
total_frames = frames.shape[0]
|
|
||||||
if num_frames == -1:
|
|
||||||
return frames
|
|
||||||
else:
|
|
||||||
frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
|
|
||||||
sampled_frames = frames[frame_indices, ...]
|
|
||||||
return sampled_frames
|
|
||||||
|
|
||||||
|
|
||||||
def encode_video_base64(frames: npt.NDArray) -> str:
|
def encode_video_base64(frames: npt.NDArray) -> str:
|
||||||
base64_frames = []
|
base64_frames = []
|
||||||
frames_list = [frames[i] for i in range(frames.shape[0])]
|
frames_list = [frames[i] for i in range(frames.shape[0])]
|
||||||
|
|||||||
@ -1,7 +1,9 @@
|
|||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from typing import TYPE_CHECKING, Any, Dict, Optional
|
from typing import TYPE_CHECKING, Any, Dict, Optional
|
||||||
|
|
||||||
|
import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import numpy.typing as npt
|
||||||
|
|
||||||
from vllm.inputs.registry import InputContext
|
from vllm.inputs.registry import InputContext
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
@ -75,3 +77,33 @@ class VideoPlugin(ImagePlugin):
|
|||||||
|
|
||||||
def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
|
def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
|
||||||
return 4096
|
return 4096
|
||||||
|
|
||||||
|
|
||||||
|
def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray:
|
||||||
|
num_frames, _, _, channels = frames.shape
|
||||||
|
new_height, new_width = size
|
||||||
|
resized_frames = np.empty((num_frames, new_height, new_width, channels),
|
||||||
|
dtype=frames.dtype)
|
||||||
|
for i, frame in enumerate(frames):
|
||||||
|
resized_frame = cv2.resize(frame, (new_width, new_height))
|
||||||
|
resized_frames[i] = resized_frame
|
||||||
|
return resized_frames
|
||||||
|
|
||||||
|
|
||||||
|
def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
|
||||||
|
_, height, width, _ = frames.shape
|
||||||
|
new_height = int(height * size_factor)
|
||||||
|
new_width = int(width * size_factor)
|
||||||
|
|
||||||
|
return resize_video(frames, (new_height, new_width))
|
||||||
|
|
||||||
|
|
||||||
|
def sample_frames_from_video(frames: npt.NDArray,
|
||||||
|
num_frames: int) -> npt.NDArray:
|
||||||
|
total_frames = frames.shape[0]
|
||||||
|
if num_frames == -1:
|
||||||
|
return frames
|
||||||
|
|
||||||
|
frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
|
||||||
|
sampled_frames = frames[frame_indices, ...]
|
||||||
|
return sampled_frames
|
||||||
|
|||||||
@ -6,7 +6,12 @@ import tempfile
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import boto3
|
from vllm.utils import PlaceholderModule
|
||||||
|
|
||||||
|
try:
|
||||||
|
import boto3
|
||||||
|
except ImportError:
|
||||||
|
boto3 = PlaceholderModule("boto3") # type: ignore[assignment]
|
||||||
|
|
||||||
|
|
||||||
def _filter_allow(paths: list[str], patterns: list[str]) -> list[str]:
|
def _filter_allow(paths: list[str], patterns: list[str]) -> list[str]:
|
||||||
|
|||||||
@ -6,10 +6,12 @@ import datetime
|
|||||||
import enum
|
import enum
|
||||||
import gc
|
import gc
|
||||||
import getpass
|
import getpass
|
||||||
|
import importlib.metadata
|
||||||
import importlib.util
|
import importlib.util
|
||||||
import inspect
|
import inspect
|
||||||
import ipaddress
|
import ipaddress
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import signal
|
import signal
|
||||||
import socket
|
import socket
|
||||||
import subprocess
|
import subprocess
|
||||||
@ -1557,6 +1559,67 @@ def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
|
|||||||
return module
|
return module
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=None)
|
||||||
|
def get_vllm_optional_dependencies():
|
||||||
|
metadata = importlib.metadata.metadata("vllm")
|
||||||
|
requirements = metadata.get_all("Requires-Dist", [])
|
||||||
|
extras = metadata.get_all("Provides-Extra", [])
|
||||||
|
|
||||||
|
return {
|
||||||
|
extra: [
|
||||||
|
re.split(r";|>=|<=|==", req)[0] for req in requirements
|
||||||
|
if req.endswith(f'extra == "{extra}"')
|
||||||
|
]
|
||||||
|
for extra in extras
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class PlaceholderModule:
|
||||||
|
"""
|
||||||
|
A placeholder object to use when a module does not exist.
|
||||||
|
|
||||||
|
This enables more informative errors when trying to access attributes
|
||||||
|
of a module that does not exists.
|
||||||
|
"""
|
||||||
|
name: str
|
||||||
|
|
||||||
|
def placeholder_attr(self, attr_path: str):
|
||||||
|
return _PlaceholderModuleAttr(self, attr_path)
|
||||||
|
|
||||||
|
def __getattr__(self, key: str):
|
||||||
|
name = self.name
|
||||||
|
|
||||||
|
try:
|
||||||
|
importlib.import_module(self.name)
|
||||||
|
except ImportError as exc:
|
||||||
|
for extra, names in get_vllm_optional_dependencies().items():
|
||||||
|
if name in names:
|
||||||
|
msg = f"Please install vllm[{extra}] for {extra} support"
|
||||||
|
raise ImportError(msg) from exc
|
||||||
|
|
||||||
|
raise exc
|
||||||
|
|
||||||
|
raise AssertionError("PlaceholderModule should not be used "
|
||||||
|
"when the original module can be imported")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class _PlaceholderModuleAttr:
|
||||||
|
module: PlaceholderModule
|
||||||
|
attr_path: str
|
||||||
|
|
||||||
|
def placeholder_attr(self, attr_path: str):
|
||||||
|
return _PlaceholderModuleAttr(self.module,
|
||||||
|
f"{self.attr_path}.{attr_path}")
|
||||||
|
|
||||||
|
def __getattr__(self, key: str):
|
||||||
|
getattr(self.module, f"{self.attr_path}.{key}")
|
||||||
|
|
||||||
|
raise AssertionError("PlaceholderModule should not be used "
|
||||||
|
"when the original module can be imported")
|
||||||
|
|
||||||
|
|
||||||
# create a library to hold the custom op
|
# create a library to hold the custom op
|
||||||
vllm_lib = Library("vllm", "FRAGMENT") # noqa
|
vllm_lib = Library("vllm", "FRAGMENT") # noqa
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Dict
|
from typing import Dict, List, Optional, Set
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
@ -19,3 +19,13 @@ class SamplingMetadata:
|
|||||||
generators: Dict[int, torch.Generator]
|
generators: Dict[int, torch.Generator]
|
||||||
|
|
||||||
max_num_logprobs: int
|
max_num_logprobs: int
|
||||||
|
|
||||||
|
no_penalties: bool
|
||||||
|
prompt_token_ids: Optional[torch.Tensor]
|
||||||
|
frequency_penalties: torch.Tensor
|
||||||
|
presence_penalties: torch.Tensor
|
||||||
|
repetition_penalties: torch.Tensor
|
||||||
|
|
||||||
|
output_token_ids: List[List[int]]
|
||||||
|
min_tokens: List[int]
|
||||||
|
stop_token_ids: List[Set[int]]
|
||||||
|
|||||||
@ -1,9 +1,11 @@
|
|||||||
"""A layer that samples the next tokens from the model's outputs."""
|
"""A layer that samples the next tokens from the model's outputs."""
|
||||||
from typing import Dict
|
from typing import Dict, List, Set, Tuple
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
|
|
||||||
|
from vllm.model_executor.layers.utils import apply_penalties
|
||||||
|
from vllm.utils import is_pin_memory_available, make_tensor_with_pad
|
||||||
from vllm.v1.outputs import SamplerOutput
|
from vllm.v1.outputs import SamplerOutput
|
||||||
from vllm.v1.sample.metadata import SamplingMetadata
|
from vllm.v1.sample.metadata import SamplingMetadata
|
||||||
|
|
||||||
@ -17,9 +19,18 @@ class Sampler(nn.Module):
|
|||||||
logits: torch.Tensor,
|
logits: torch.Tensor,
|
||||||
sampling_metadata: SamplingMetadata,
|
sampling_metadata: SamplingMetadata,
|
||||||
) -> SamplerOutput:
|
) -> SamplerOutput:
|
||||||
|
_apply_min_token_penalties(logits, sampling_metadata.output_token_ids,
|
||||||
|
sampling_metadata.stop_token_ids,
|
||||||
|
sampling_metadata.min_tokens)
|
||||||
|
if not sampling_metadata.no_penalties:
|
||||||
|
assert sampling_metadata.prompt_token_ids is not None
|
||||||
|
_apply_penalties(logits, sampling_metadata.prompt_token_ids,
|
||||||
|
sampling_metadata.presence_penalties,
|
||||||
|
sampling_metadata.frequency_penalties,
|
||||||
|
sampling_metadata.repetition_penalties,
|
||||||
|
sampling_metadata.output_token_ids)
|
||||||
logits = self.apply_temperature(logits, sampling_metadata.temperature)
|
logits = self.apply_temperature(logits, sampling_metadata.temperature)
|
||||||
logits = self.apply_top_k_top_p(logits, sampling_metadata)
|
logits = self.apply_top_k_top_p(logits, sampling_metadata)
|
||||||
|
|
||||||
probs = self.get_probs(logits)
|
probs = self.get_probs(logits)
|
||||||
sampled = self.sample(probs, sampling_metadata)
|
sampled = self.sample(probs, sampling_metadata)
|
||||||
# Use int32 to reduce the tensor size.
|
# Use int32 to reduce the tensor size.
|
||||||
@ -157,3 +168,53 @@ def _apply_top_k_top_p(
|
|||||||
# Re-sort the probabilities.
|
# Re-sort the probabilities.
|
||||||
logits = logits_sort.scatter(dim=-1, index=logits_idx, src=logits_sort)
|
logits = logits_sort.scatter(dim=-1, index=logits_idx, src=logits_sort)
|
||||||
return logits
|
return logits
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_min_token_penalties(logits: torch.Tensor,
|
||||||
|
output_token_ids: List[List[int]],
|
||||||
|
stop_token_ids: List[Set[int]],
|
||||||
|
min_tokens: List[int]):
|
||||||
|
"""
|
||||||
|
Applies minimum token penalty by setting the logits of the stop tokens
|
||||||
|
to -inf.
|
||||||
|
"""
|
||||||
|
min_tokens_logits_to_penalize: List[Tuple[int, int]] = []
|
||||||
|
for index, min_token in enumerate(min_tokens):
|
||||||
|
if (len(output_token_ids[index]) < min_token):
|
||||||
|
for stop_token_id in stop_token_ids[index]:
|
||||||
|
min_tokens_logits_to_penalize.append((index, stop_token_id))
|
||||||
|
if min_tokens_logits_to_penalize:
|
||||||
|
logits[tuple(zip(*min_tokens_logits_to_penalize))] = -float("inf")
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_penalties(logits: torch.Tensor, prompt_token_ids: torch.Tensor,
|
||||||
|
presence_penalties: torch.Tensor,
|
||||||
|
frequency_penalties: torch.Tensor,
|
||||||
|
repetition_penalties: torch.Tensor,
|
||||||
|
output_token_ids: List[List[int]]):
|
||||||
|
"""
|
||||||
|
Applies presence, frequency and repetition penalties to the logits.
|
||||||
|
"""
|
||||||
|
_, vocab_size = logits.shape
|
||||||
|
output_tokens_t = _convert_to_tensors(output_token_ids, vocab_size,
|
||||||
|
logits.device)
|
||||||
|
return apply_penalties(logits, prompt_token_ids, output_tokens_t,
|
||||||
|
presence_penalties, frequency_penalties,
|
||||||
|
repetition_penalties)
|
||||||
|
|
||||||
|
|
||||||
|
def _convert_to_tensors(output_token_ids: List[List[int]], vocab_size: int,
|
||||||
|
device: torch.device) -> torch.Tensor:
|
||||||
|
"""
|
||||||
|
Convert the different list data structures to tensors.
|
||||||
|
"""
|
||||||
|
output_tokens_tensor = make_tensor_with_pad(
|
||||||
|
output_token_ids,
|
||||||
|
# Use the value of vocab_size as a pad since we don't have a
|
||||||
|
# token_id of this value.
|
||||||
|
pad=vocab_size,
|
||||||
|
device="cpu",
|
||||||
|
dtype=torch.int64,
|
||||||
|
pin_memory=is_pin_memory_available(),
|
||||||
|
)
|
||||||
|
return output_tokens_tensor.to(device, non_blocking=True)
|
||||||
|
|||||||
@ -44,12 +44,14 @@ class InputBatch:
|
|||||||
max_num_blocks_per_req: int,
|
max_num_blocks_per_req: int,
|
||||||
device: torch.device,
|
device: torch.device,
|
||||||
pin_memory: bool,
|
pin_memory: bool,
|
||||||
|
vocab_size: int,
|
||||||
):
|
):
|
||||||
self.max_num_reqs = max_num_reqs
|
self.max_num_reqs = max_num_reqs
|
||||||
self.max_model_len = max_model_len
|
self.max_model_len = max_model_len
|
||||||
self.max_num_blocks_per_req = max_num_blocks_per_req
|
self.max_num_blocks_per_req = max_num_blocks_per_req
|
||||||
self.device = device
|
self.device = device
|
||||||
self.pin_memory = pin_memory
|
self.pin_memory = pin_memory
|
||||||
|
self.vocab_size = vocab_size
|
||||||
|
|
||||||
self.req_ids: List[Optional[str]] = [None] * max_num_reqs
|
self.req_ids: List[Optional[str]] = [None] * max_num_reqs
|
||||||
self.req_id_to_index: Dict[str, int] = {}
|
self.req_id_to_index: Dict[str, int] = {}
|
||||||
@ -64,6 +66,7 @@ class InputBatch:
|
|||||||
)
|
)
|
||||||
self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
|
self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
|
||||||
self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
|
self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
|
||||||
|
self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
|
||||||
|
|
||||||
# Block table.
|
# Block table.
|
||||||
self.block_table = BlockTable(
|
self.block_table = BlockTable(
|
||||||
@ -106,6 +109,50 @@ class InputBatch:
|
|||||||
self.top_k_cpu = self.top_k_cpu_tensor.numpy()
|
self.top_k_cpu = self.top_k_cpu_tensor.numpy()
|
||||||
self.top_k_reqs: Set[str] = set()
|
self.top_k_reqs: Set[str] = set()
|
||||||
|
|
||||||
|
# Frequency penalty related data structures
|
||||||
|
self.frequency_penalties = torch.empty((max_num_reqs, ),
|
||||||
|
dtype=torch.float,
|
||||||
|
device=device)
|
||||||
|
self.frequency_penalties_cpu_tensor = torch.empty(
|
||||||
|
(max_num_reqs, ),
|
||||||
|
dtype=torch.float,
|
||||||
|
device="cpu",
|
||||||
|
pin_memory=pin_memory)
|
||||||
|
self.frequency_penalties_cpu = \
|
||||||
|
self.frequency_penalties_cpu_tensor.numpy()
|
||||||
|
self.frequency_penalties_reqs: Set[str] = set()
|
||||||
|
|
||||||
|
# Presence penalty related data structures
|
||||||
|
self.presence_penalties = torch.empty((max_num_reqs, ),
|
||||||
|
dtype=torch.float,
|
||||||
|
device=device)
|
||||||
|
self.presence_penalties_cpu_tensor = torch.empty((max_num_reqs, ),
|
||||||
|
dtype=torch.float,
|
||||||
|
device="cpu",
|
||||||
|
pin_memory=pin_memory)
|
||||||
|
self.presence_penalties_cpu = \
|
||||||
|
self.presence_penalties_cpu_tensor.numpy()
|
||||||
|
self.presence_penalties_reqs: Set[str] = set()
|
||||||
|
|
||||||
|
# Repetition penalty related data structures
|
||||||
|
self.repetition_penalties = torch.empty((max_num_reqs, ),
|
||||||
|
dtype=torch.float,
|
||||||
|
device=device)
|
||||||
|
self.repetition_penalties_cpu_tensor = torch.empty(
|
||||||
|
(max_num_reqs, ),
|
||||||
|
dtype=torch.float,
|
||||||
|
device="cpu",
|
||||||
|
pin_memory=pin_memory)
|
||||||
|
self.repetition_penalties_cpu = \
|
||||||
|
self.repetition_penalties_cpu_tensor.numpy()
|
||||||
|
self.repetition_penalties_reqs: Set[str] = set()
|
||||||
|
|
||||||
|
self.min_tokens: List[int] = [0] * max_num_reqs
|
||||||
|
self.stop_token_ids: List[Set[int]] = [
|
||||||
|
set() for _ in range(max_num_reqs)
|
||||||
|
]
|
||||||
|
self.prompt_token_ids: Optional[torch.Tensor] = None
|
||||||
|
|
||||||
# req_index -> generator
|
# req_index -> generator
|
||||||
# NOTE(woosuk): The indices of the requests that do not have their own
|
# NOTE(woosuk): The indices of the requests that do not have their own
|
||||||
# generator should not be included in the dictionary.
|
# generator should not be included in the dictionary.
|
||||||
@ -129,6 +176,7 @@ class InputBatch:
|
|||||||
|
|
||||||
# Copy the prompt token ids and output token ids.
|
# Copy the prompt token ids and output token ids.
|
||||||
num_prompt_tokens = len(request.prompt_token_ids)
|
num_prompt_tokens = len(request.prompt_token_ids)
|
||||||
|
self.num_prompt_tokens[req_index] = num_prompt_tokens
|
||||||
self.token_ids_cpu[
|
self.token_ids_cpu[
|
||||||
req_index, :num_prompt_tokens] = request.prompt_token_ids
|
req_index, :num_prompt_tokens] = request.prompt_token_ids
|
||||||
start_idx = num_prompt_tokens
|
start_idx = num_prompt_tokens
|
||||||
@ -152,6 +200,20 @@ class InputBatch:
|
|||||||
self.top_k_cpu[req_index] = sampling_params.top_k
|
self.top_k_cpu[req_index] = sampling_params.top_k
|
||||||
if sampling_params.top_k > 0:
|
if sampling_params.top_k > 0:
|
||||||
self.top_k_reqs.add(req_id)
|
self.top_k_reqs.add(req_id)
|
||||||
|
self.frequency_penalties_cpu[req_index] = \
|
||||||
|
sampling_params.frequency_penalty
|
||||||
|
if sampling_params.frequency_penalty != 0.0:
|
||||||
|
self.frequency_penalties_reqs.add(req_id)
|
||||||
|
self.presence_penalties_cpu[req_index] = \
|
||||||
|
sampling_params.presence_penalty
|
||||||
|
if sampling_params.presence_penalty != 0.0:
|
||||||
|
self.presence_penalties_reqs.add(req_id)
|
||||||
|
self.repetition_penalties_cpu[req_index] = \
|
||||||
|
sampling_params.repetition_penalty
|
||||||
|
if sampling_params.repetition_penalty != 1.0:
|
||||||
|
self.repetition_penalties_reqs.add(req_id)
|
||||||
|
self.min_tokens[req_index] = sampling_params.min_tokens
|
||||||
|
self.stop_token_ids[req_index] = sampling_params.all_stop_token_ids
|
||||||
|
|
||||||
# NOTE(woosuk): self.generators should not include the requests that
|
# NOTE(woosuk): self.generators should not include the requests that
|
||||||
# do not have their own generator.
|
# do not have their own generator.
|
||||||
@ -174,6 +236,9 @@ class InputBatch:
|
|||||||
self.random_reqs.discard(req_id)
|
self.random_reqs.discard(req_id)
|
||||||
self.top_p_reqs.discard(req_id)
|
self.top_p_reqs.discard(req_id)
|
||||||
self.top_k_reqs.discard(req_id)
|
self.top_k_reqs.discard(req_id)
|
||||||
|
self.frequency_penalties_reqs.discard(req_id)
|
||||||
|
self.presence_penalties_reqs.discard(req_id)
|
||||||
|
self.repetition_penalties_reqs.discard(req_id)
|
||||||
self.generators.pop(req_index, None)
|
self.generators.pop(req_index, None)
|
||||||
self.num_logprobs.pop(req_id, None)
|
self.num_logprobs.pop(req_id, None)
|
||||||
self.prompt_logprob_reqs.discard(req_id)
|
self.prompt_logprob_reqs.discard(req_id)
|
||||||
@ -186,6 +251,9 @@ class InputBatch:
|
|||||||
self.random_reqs.clear()
|
self.random_reqs.clear()
|
||||||
self.top_p_reqs.clear()
|
self.top_p_reqs.clear()
|
||||||
self.top_k_reqs.clear()
|
self.top_k_reqs.clear()
|
||||||
|
self.frequency_penalties_reqs.clear()
|
||||||
|
self.presence_penalties_reqs.clear()
|
||||||
|
self.repetition_penalties_reqs.clear()
|
||||||
self.generators.clear()
|
self.generators.clear()
|
||||||
self.num_logprobs.clear()
|
self.num_logprobs.clear()
|
||||||
self.prompt_logprob_reqs.clear()
|
self.prompt_logprob_reqs.clear()
|
||||||
@ -219,6 +287,8 @@ class InputBatch:
|
|||||||
# block_table.
|
# block_table.
|
||||||
self.token_ids_cpu[empty_index] = self.token_ids_cpu[
|
self.token_ids_cpu[empty_index] = self.token_ids_cpu[
|
||||||
last_req_index]
|
last_req_index]
|
||||||
|
self.num_prompt_tokens[empty_index] = \
|
||||||
|
self.num_prompt_tokens[last_req_index]
|
||||||
self.num_computed_tokens_cpu[
|
self.num_computed_tokens_cpu[
|
||||||
empty_index] = self.num_computed_tokens_cpu[last_req_index]
|
empty_index] = self.num_computed_tokens_cpu[last_req_index]
|
||||||
self.block_table.move_row(last_req_index, empty_index)
|
self.block_table.move_row(last_req_index, empty_index)
|
||||||
@ -226,6 +296,15 @@ class InputBatch:
|
|||||||
last_req_index]
|
last_req_index]
|
||||||
self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index]
|
self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index]
|
||||||
self.top_k_cpu[empty_index] = self.top_k_cpu[last_req_index]
|
self.top_k_cpu[empty_index] = self.top_k_cpu[last_req_index]
|
||||||
|
self.frequency_penalties_cpu[empty_index] = \
|
||||||
|
self.frequency_penalties_cpu[last_req_index]
|
||||||
|
self.presence_penalties_cpu[empty_index] = \
|
||||||
|
self.presence_penalties_cpu[last_req_index]
|
||||||
|
self.repetition_penalties_cpu[empty_index] = \
|
||||||
|
self.repetition_penalties_cpu[last_req_index]
|
||||||
|
self.min_tokens[empty_index] = self.min_tokens[last_req_index]
|
||||||
|
self.stop_token_ids[empty_index] = \
|
||||||
|
self.stop_token_ids[last_req_index]
|
||||||
generator = self.generators.pop(last_req_index, None)
|
generator = self.generators.pop(last_req_index, None)
|
||||||
if generator is not None:
|
if generator is not None:
|
||||||
self.generators[empty_index] = generator
|
self.generators[empty_index] = generator
|
||||||
@ -235,6 +314,7 @@ class InputBatch:
|
|||||||
|
|
||||||
def make_sampling_metadata(
|
def make_sampling_metadata(
|
||||||
self,
|
self,
|
||||||
|
req_id_output_token_ids: Dict[str, List[int]],
|
||||||
skip_copy: bool = False,
|
skip_copy: bool = False,
|
||||||
) -> SamplingMetadata:
|
) -> SamplingMetadata:
|
||||||
if not skip_copy:
|
if not skip_copy:
|
||||||
@ -244,6 +324,37 @@ class InputBatch:
|
|||||||
self.top_p_cpu_tensor[:self.num_reqs], non_blocking=True)
|
self.top_p_cpu_tensor[:self.num_reqs], non_blocking=True)
|
||||||
self.top_k[:self.num_reqs].copy_(
|
self.top_k[:self.num_reqs].copy_(
|
||||||
self.top_k_cpu_tensor[:self.num_reqs], non_blocking=True)
|
self.top_k_cpu_tensor[:self.num_reqs], non_blocking=True)
|
||||||
|
if not self.no_penalties:
|
||||||
|
# Since syncing these tensors is expensive only copy them
|
||||||
|
# if necessary i.e. if there are requests which require
|
||||||
|
# penalties to be applied during sampling.
|
||||||
|
self.frequency_penalties[:self.num_reqs].copy_(
|
||||||
|
self.frequency_penalties_cpu_tensor[:self.num_reqs],
|
||||||
|
non_blocking=True)
|
||||||
|
self.presence_penalties[:self.num_reqs].copy_(
|
||||||
|
self.presence_penalties_cpu_tensor[:self.num_reqs],
|
||||||
|
non_blocking=True)
|
||||||
|
self.repetition_penalties[:self.num_reqs].copy_(
|
||||||
|
self.repetition_penalties_cpu_tensor[:self.num_reqs],
|
||||||
|
non_blocking=True)
|
||||||
|
# The prompt tokens are used only for applying penalties during
|
||||||
|
# the sampling process. Hence copy these tensors only when
|
||||||
|
# there are requests which need penalties to be applied.
|
||||||
|
self.prompt_token_ids = self._make_prompt_token_ids_tensor()
|
||||||
|
|
||||||
|
output_token_ids: List[List[int]] = []
|
||||||
|
|
||||||
|
for req_id in self.req_ids[:self.num_reqs]:
|
||||||
|
assert req_id is not None
|
||||||
|
# Currently we create a tensor for output_token_ids from scratch
|
||||||
|
# at each step. However, for the penalties computation what we
|
||||||
|
# need is stats about the token ids present in the output. This
|
||||||
|
# stats can be maintained incrementally instead of computing it
|
||||||
|
# from scratch at each step.
|
||||||
|
# TODO - Replace this with incremental update to output token
|
||||||
|
# statistics.
|
||||||
|
output_token_ids.append(req_id_output_token_ids[req_id])
|
||||||
|
|
||||||
return SamplingMetadata(
|
return SamplingMetadata(
|
||||||
temperature=self.temperature[:self.num_reqs],
|
temperature=self.temperature[:self.num_reqs],
|
||||||
all_greedy=self.all_greedy,
|
all_greedy=self.all_greedy,
|
||||||
@ -254,8 +365,33 @@ class InputBatch:
|
|||||||
no_top_k=self.no_top_k,
|
no_top_k=self.no_top_k,
|
||||||
generators=self.generators,
|
generators=self.generators,
|
||||||
max_num_logprobs=self.max_num_logprobs,
|
max_num_logprobs=self.max_num_logprobs,
|
||||||
|
prompt_token_ids=self.prompt_token_ids,
|
||||||
|
frequency_penalties=self.frequency_penalties[:self.num_reqs],
|
||||||
|
presence_penalties=self.presence_penalties[:self.num_reqs],
|
||||||
|
repetition_penalties=self.repetition_penalties[:self.num_reqs],
|
||||||
|
output_token_ids=output_token_ids,
|
||||||
|
min_tokens=self.min_tokens[:self.num_reqs],
|
||||||
|
stop_token_ids=self.stop_token_ids[:self.num_reqs],
|
||||||
|
no_penalties=self.no_penalties,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _make_prompt_token_ids_tensor(self) -> torch.Tensor:
|
||||||
|
max_prompt_len = self.num_prompt_tokens[:self.num_reqs].max()
|
||||||
|
prompt_token_ids_cpu_tensor = torch.empty(
|
||||||
|
(self.num_reqs, max_prompt_len),
|
||||||
|
device="cpu",
|
||||||
|
dtype=torch.int64,
|
||||||
|
pin_memory=self.pin_memory)
|
||||||
|
prompt_token_ids = prompt_token_ids_cpu_tensor.numpy()
|
||||||
|
prompt_token_ids[:] = (
|
||||||
|
self.token_ids_cpu[:self.num_reqs, :max_prompt_len])
|
||||||
|
# Use the value of vocab_size as a pad since we don't have a
|
||||||
|
# token_id of this value.
|
||||||
|
for i in range(self.num_reqs):
|
||||||
|
prompt_token_ids[i, self.num_prompt_tokens[i]:] = self.vocab_size
|
||||||
|
return prompt_token_ids_cpu_tensor.to(device=self.device,
|
||||||
|
non_blocking=True)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def num_reqs(self) -> int:
|
def num_reqs(self) -> int:
|
||||||
return len(self.req_id_to_index)
|
return len(self.req_id_to_index)
|
||||||
@ -276,6 +412,12 @@ class InputBatch:
|
|||||||
def no_top_k(self) -> bool:
|
def no_top_k(self) -> bool:
|
||||||
return len(self.top_k_reqs) == 0
|
return len(self.top_k_reqs) == 0
|
||||||
|
|
||||||
|
@property
|
||||||
|
def no_penalties(self) -> bool:
|
||||||
|
return (len(self.presence_penalties_reqs) == 0
|
||||||
|
and len(self.frequency_penalties_reqs) == 0
|
||||||
|
and len(self.repetition_penalties_reqs) == 0)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def max_num_logprobs(self) -> int:
|
def max_num_logprobs(self) -> int:
|
||||||
return max(self.num_logprobs.values()) if self.num_logprobs else 0
|
return max(self.num_logprobs.values()) if self.num_logprobs else 0
|
||||||
|
|||||||
@ -105,6 +105,7 @@ class GPUModelRunner:
|
|||||||
max_num_blocks_per_req=self.max_num_blocks_per_req,
|
max_num_blocks_per_req=self.max_num_blocks_per_req,
|
||||||
device=self.device,
|
device=self.device,
|
||||||
pin_memory=self.pin_memory,
|
pin_memory=self.pin_memory,
|
||||||
|
vocab_size=model_config.get_vocab_size(),
|
||||||
)
|
)
|
||||||
|
|
||||||
self.use_cuda_graph = (self.vllm_config.compilation_config.level
|
self.use_cuda_graph = (self.vllm_config.compilation_config.level
|
||||||
@ -383,7 +384,12 @@ class GPUModelRunner:
|
|||||||
or scheduler_output.scheduled_resumed_reqs):
|
or scheduler_output.scheduled_resumed_reqs):
|
||||||
skip_copy = False
|
skip_copy = False
|
||||||
# Create the sampling metadata.
|
# Create the sampling metadata.
|
||||||
sampling_metadata = self.input_batch.make_sampling_metadata(skip_copy)
|
req_id_output_token_ids: Dict[str, List[int]] = \
|
||||||
|
{req_id: req.output_token_ids \
|
||||||
|
for req_id, req in self.requests.items()}
|
||||||
|
|
||||||
|
sampling_metadata = self.input_batch.make_sampling_metadata(
|
||||||
|
req_id_output_token_ids, skip_copy)
|
||||||
return sampling_metadata
|
return sampling_metadata
|
||||||
|
|
||||||
def _execute_encoder(self, scheduler_output: "SchedulerOutput"):
|
def _execute_encoder(self, scheduler_output: "SchedulerOutput"):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user