From 51a624bf024e351e678b598521b72a2e19b5e2ef Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 26 Dec 2024 12:23:20 +0800 Subject: [PATCH 1/8] [Misc] Move some multimodal utils to modality-specific modules (#11494) Signed-off-by: DarkLight1337 --- .../decoder_only/vision_language/test_awq.py | 2 +- .../vision_language/test_h2ovl.py | 2 +- .../vision_language/test_phi3v.py | 2 +- .../vision_language/test_qwen2_vl.py | 4 +- .../vision_language/vlm_utils/builders.py | 5 +- .../vlm_utils/custom_inputs.py | 5 +- .../vision_language/test_mllama.py | 2 +- tests/multimodal/test_mapper.py | 2 +- vllm/assets/video.py | 2 +- vllm/multimodal/audio.py | 12 ++++ vllm/multimodal/image.py | 12 ++++ vllm/multimodal/utils.py | 68 +------------------ vllm/multimodal/video.py | 43 ++++++++++++ 13 files changed, 84 insertions(+), 77 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_awq.py b/tests/models/decoder_only/vision_language/test_awq.py index 6e6e5b40d6a35..18ceb34a4e042 100644 --- a/tests/models/decoder_only/vision_language/test_awq.py +++ b/tests/models/decoder_only/vision_language/test_awq.py @@ -3,7 +3,7 @@ from typing import List, Optional, Type import pytest import torch -from vllm.multimodal.utils import rescale_image_size +from vllm.multimodal.image import rescale_image_size from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets from ...utils import check_logprobs_close diff --git a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py index 45a7365204403..7406df253e7f0 100644 --- a/tests/models/decoder_only/vision_language/test_h2ovl.py +++ b/tests/models/decoder_only/vision_language/test_h2ovl.py @@ -8,7 +8,7 @@ from transformers import AutoConfig # Import the functions to test from vllm.model_executor.models.h2ovl import (calculate_num_blocks, image_to_pixel_values_wrapper) -from vllm.multimodal.utils import rescale_image_size +from vllm.multimodal.image import rescale_image_size models = [ "h2oai/h2ovl-mississippi-800m", # Replace with your actual model names diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py index 82eae0705c9ba..3a8934adfb076 100644 --- a/tests/models/decoder_only/vision_language/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/test_phi3v.py @@ -5,7 +5,7 @@ from typing import List, Optional, Tuple, Type import pytest from transformers import AutoTokenizer -from vllm.multimodal.utils import rescale_image_size +from vllm.multimodal.image import rescale_image_size from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py index 71b6ba4dca435..51fe7d2ad32a8 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -6,8 +6,8 @@ import torch from PIL import Image from vllm.entrypoints.llm import LLM -from vllm.multimodal.utils import (rescale_image_size, rescale_video_size, - sample_frames_from_video) +from vllm.multimodal.image import rescale_image_size +from vllm.multimodal.video import rescale_video_size, sample_frames_from_video from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput, PromptVideoInput, VllmRunner) diff --git a/tests/models/decoder_only/vision_language/vlm_utils/builders.py b/tests/models/decoder_only/vision_language/vlm_utils/builders.py index 66668296139f5..59773be709fa8 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/builders.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/builders.py @@ -5,8 +5,9 @@ from typing import Callable, Iterable, List, Optional, Tuple, Union import torch -from vllm.multimodal.utils import (rescale_image_size, rescale_video_size, - resize_video, sample_frames_from_video) +from vllm.multimodal.image import rescale_image_size +from vllm.multimodal.video import (rescale_video_size, resize_video, + sample_frames_from_video) from .....conftest import _ImageAssets, _VideoAssets from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER, diff --git a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py index e698d8d3f6f56..2291f4fa0d0ac 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py @@ -1,8 +1,9 @@ """Custom input builders for edge-cases in different models.""" from typing import Callable -from vllm.multimodal.utils import (rescale_image_size, rescale_video_size, - resize_video, sample_frames_from_video) +from vllm.multimodal.image import rescale_image_size +from vllm.multimodal.video import (rescale_video_size, resize_video, + sample_frames_from_video) from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS from .builders import build_multi_image_inputs, build_single_image_inputs diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index 77dd1d81f84d7..636a3eedff31b 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -6,7 +6,7 @@ from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) -from vllm.multimodal.utils import rescale_image_size +from vllm.multimodal.image import rescale_image_size from vllm.sequence import SampleLogprobs from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py index 71832acbd17b8..81f2a06182bcc 100644 --- a/tests/multimodal/test_mapper.py +++ b/tests/multimodal/test_mapper.py @@ -6,7 +6,7 @@ from transformers import LlavaNextImageProcessor from vllm.config import ModelConfig from vllm.multimodal import MultiModalRegistry -from vllm.multimodal.utils import rescale_image_size +from vllm.multimodal.image import rescale_image_size @pytest.fixture diff --git a/vllm/assets/video.py b/vllm/assets/video.py index e4dcab10466db..e6779935bad17 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -7,7 +7,7 @@ import numpy.typing as npt from huggingface_hub import hf_hub_download from PIL import Image -from vllm.multimodal.utils import (sample_frames_from_video, +from vllm.multimodal.video import (sample_frames_from_video, try_import_video_packages) from .base import get_cache_dir diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index c92deddbcb255..314d21b746236 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -1,3 +1,5 @@ +from typing import Any + import numpy as np import numpy.typing as npt @@ -26,6 +28,16 @@ class AudioPlugin(MultiModalPlugin): "There is no default maximum multimodal tokens") +def try_import_audio_packages() -> tuple[Any, Any]: + try: + import librosa + import soundfile + except ImportError as exc: + raise ImportError( + "Please install vllm[audio] for audio support.") from exc + return librosa, soundfile + + def resample_audio( audio: npt.NDArray[np.floating], *, diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 97bbce1ce1570..c705e1a3d1554 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -84,3 +84,15 @@ class ImagePlugin(MultiModalPlugin): def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: return 3000 + + +def rescale_image_size(image: Image.Image, + size_factor: float, + transpose: int = -1) -> Image.Image: + """Rescale the dimensions of an image by a constant factor.""" + new_width = int(image.width * size_factor) + new_height = int(image.height * size_factor) + image = image.resize((new_width, new_height)) + if transpose >= 0: + image = image.transpose(Image.Transpose(transpose)) + return image diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index be9643598448d..1cb9036bdfda3 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -2,7 +2,7 @@ import base64 import os from functools import lru_cache from io import BytesIO -from typing import Any, List, Optional, Tuple, TypeVar, Union +from typing import List, Optional, Tuple, TypeVar, Union import numpy as np import numpy.typing as npt @@ -14,7 +14,9 @@ from vllm.connections import global_http_connection from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer +from .audio import try_import_audio_packages from .inputs import MultiModalDataDict, PlaceholderRange +from .video import try_import_video_packages logger = init_logger(__name__) @@ -198,16 +200,6 @@ async def async_fetch_video(video_url: str, return video -def try_import_audio_packages() -> Tuple[Any, Any]: - try: - import librosa - import soundfile - except ImportError as exc: - raise ImportError( - "Please install vllm[audio] for audio support.") from exc - return librosa, soundfile - - def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]: """ Load audio from a URL. @@ -324,60 +316,6 @@ def load_image_from_base64(image: Union[bytes, str]) -> Image.Image: return _load_image_from_bytes(base64.b64decode(image)) -def rescale_image_size(image: Image.Image, - size_factor: float, - transpose: int = -1) -> Image.Image: - """Rescale the dimensions of an image by a constant factor.""" - new_width = int(image.width * size_factor) - new_height = int(image.height * size_factor) - image = image.resize((new_width, new_height)) - if transpose >= 0: - image = image.transpose(Image.Transpose(transpose)) - return image - - -def try_import_video_packages(): - try: - import cv2 - import decord - except ImportError as exc: - raise ImportError( - "Please install vllm[video] for video support.") from exc - return cv2, decord - - -def resize_video(frames: npt.NDArray, size: Tuple[int, int]) -> npt.NDArray: - cv2, _ = try_import_video_packages() - - num_frames, _, _, channels = frames.shape - new_height, new_width = size - resized_frames = np.empty((num_frames, new_height, new_width, channels), - dtype=frames.dtype) - for i, frame in enumerate(frames): - resized_frame = cv2.resize(frame, (new_width, new_height)) - resized_frames[i] = resized_frame - return resized_frames - - -def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray: - _, height, width, _ = frames.shape - new_height = int(height * size_factor) - new_width = int(width * size_factor) - - return resize_video(frames, (new_height, new_width)) - - -def sample_frames_from_video(frames: npt.NDArray, - num_frames: int) -> npt.NDArray: - total_frames = frames.shape[0] - if num_frames == -1: - return frames - else: - frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int) - sampled_frames = frames[frame_indices, ...] - return sampled_frames - - def encode_video_base64(frames: npt.NDArray) -> str: base64_frames = [] frames_list = [frames[i] for i in range(frames.shape[0])] diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index ba9bf58a4a20c..bfcdef70718bc 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -2,6 +2,7 @@ from functools import lru_cache from typing import TYPE_CHECKING, Any, Dict, Optional import numpy as np +import numpy.typing as npt from vllm.inputs.registry import InputContext from vllm.logger import init_logger @@ -75,3 +76,45 @@ class VideoPlugin(ImagePlugin): def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: return 4096 + + +def try_import_video_packages() -> tuple[Any, Any]: + try: + import cv2 + import decord + except ImportError as exc: + raise ImportError( + "Please install vllm[video] for video support.") from exc + return cv2, decord + + +def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray: + cv2, _ = try_import_video_packages() + + num_frames, _, _, channels = frames.shape + new_height, new_width = size + resized_frames = np.empty((num_frames, new_height, new_width, channels), + dtype=frames.dtype) + for i, frame in enumerate(frames): + resized_frame = cv2.resize(frame, (new_width, new_height)) + resized_frames[i] = resized_frame + return resized_frames + + +def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray: + _, height, width, _ = frames.shape + new_height = int(height * size_factor) + new_width = int(width * size_factor) + + return resize_video(frames, (new_height, new_width)) + + +def sample_frames_from_video(frames: npt.NDArray, + num_frames: int) -> npt.NDArray: + total_frames = frames.shape[0] + if num_frames == -1: + return frames + + frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int) + sampled_frames = frames[frame_indices, ...] + return sampled_frames From dbeac95dbbf898bcc0965528fc767e9cadbbe0c5 Mon Sep 17 00:00:00 2001 From: Lucas Tucker <47258766+lucas-tucker@users.noreply.github.com> Date: Wed, 25 Dec 2024 23:04:07 -0600 Subject: [PATCH 2/8] Mypy checking for vllm/compilation (#11496) Signed-off-by: lucast2021 Co-authored-by: lucast2021 --- vllm/compilation/backends.py | 10 +++++----- vllm/compilation/multi_output_match.py | 3 ++- vllm/compilation/pass_manager.py | 4 ++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 0c7bbfe599b02..826d1744d88a5 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -141,14 +141,14 @@ class AlwaysHitShapeEnv: return "" -def wrap_inductor(graph, +def wrap_inductor(graph: fx.GraphModule, example_inputs, additional_inductor_config, compilation_config: CompilationConfig, graph_index: int = 0, num_graphs: int = 1, runtime_shape: Optional[int] = None, - use_inductor: bool = True): + use_inductor: bool = True) -> Any: if graph_index == 0: # before compiling the first graph, record the start time global compilation_start_time @@ -209,7 +209,7 @@ def wrap_inductor(graph, returns_tuple = graph_returns_tuple(graph) # this is the graph we return to Dynamo to run - def compiled_graph(*args): + def compiled_graph(*args) -> Optional[fx.CompiledFxGraph]: # convert args to list list_args = list(args) graph_output = inductor_compiled_graph(list_args) @@ -247,7 +247,7 @@ def wrap_inductor(graph, # see https://github.com/pytorch/pytorch/blob/9f5ebf3fc609105a74eab4ccc24932d6353ff566/torch/_inductor/codecache.py#L1221 # noqa return - def _get_shape_env(): + def _get_shape_env() -> AlwaysHitShapeEnv: return AlwaysHitShapeEnv() with patch(# for hijacking the hash of the compiled graph @@ -537,7 +537,7 @@ class VllmBackend: example_inputs[x].clone() for x in self.sym_tensor_indices ] - def copy_and_call(*args): + def copy_and_call(*args) -> fx.GraphModule: list_args = list(args) for i, index in enumerate(self.sym_tensor_indices): runtime_tensor = list_args[index] diff --git a/vllm/compilation/multi_output_match.py b/vllm/compilation/multi_output_match.py index 0ad648abfbb3a..b6bcecdc89e26 100644 --- a/vllm/compilation/multi_output_match.py +++ b/vllm/compilation/multi_output_match.py @@ -7,6 +7,7 @@ from torch import fx from torch._higher_order_ops.auto_functionalize import auto_functionalized from torch._inductor import pattern_matcher as pm from torch._ops import OpOverload +from torch.fx import Node from vllm.compilation.fx_utils import find_auto_fn @@ -97,7 +98,7 @@ class MultiOutputMatch(abc.ABC): self.graph.call_function(operator.getitem, (tuple_node, idx)) for idx in indices) - def insert_auto_fn(self, op: OpOverload, kwargs): + def insert_auto_fn(self, op: OpOverload, kwargs) -> Node: """ Insert an auto_functionalized node with the given op and kwargs. """ diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index fb522ae053e97..34f5f355798b2 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -1,4 +1,4 @@ -from typing import List +from typing import Any, Dict, List from torch import fx as fx @@ -53,7 +53,7 @@ class PostGradPassManager: assert isinstance(pass_, InductorPass) self.passes.append(pass_) - def __getstate__(self): + def __getstate__(self) -> Dict[str, List[Any]]: """ Custom pickling for the pass manager, as some passes cannot be pickled. Pickling occurs because the pass manager is set as the value of From aa25985bd1e7a4925a7061fdfbc93893b492627b Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 26 Dec 2024 15:52:48 +0800 Subject: [PATCH 3/8] [Misc][LoRA] Fix LoRA weight mapper (#11495) Signed-off-by: Jee Jee Li --- tests/lora/test_lora_checkpoints.py | 15 +++++++++----- tests/lora/test_qwen2vl.py | 6 +++++- vllm/lora/models.py | 3 ++- vllm/lora/utils.py | 32 ++++++++++------------------- vllm/lora/worker_manager.py | 2 ++ 5 files changed, 30 insertions(+), 28 deletions(-) diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index 9842203eb15e0..537d95b025a9d 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -74,7 +74,7 @@ def test_load_checkpoints( embedding_padding_modules=embed_padding_modules) -def test_lora_weights_mapping(baichuan_lora_files, ): +def test_lora_weights_mapping(baichuan_lora_files): supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping embedding_modules = BaiChuanBaseForCausalLM.embedding_modules @@ -86,10 +86,14 @@ def test_lora_weights_mapping(baichuan_lora_files, ): else: expected_lora_modules.append(module) - hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ - "model.": "language_model.model.", - }, ) - + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "model.": "language_model.model.", + }, + orig_to_new_substr={ + ".layers.": ".baichuan_layers.", + }, + ) lora_model = LoRAModel.from_local_checkpoint( baichuan_lora_files, expected_lora_modules, @@ -101,3 +105,4 @@ def test_lora_weights_mapping(baichuan_lora_files, ): ) for name in lora_model.loras: assert name.startswith(hf_to_vllm_mapper.orig_to_new_prefix["model."]) + assert ".baichuan_layers." in name diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index c8c720ff0c776..c9f48402b0268 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -22,7 +22,7 @@ IMAGE_ASSETS = [ # After fine-tuning with LoRA, all generated content should start begin `A`. EXPECTED_OUTPUT = [ - "A stop sign stands prominently in the foreground, with a traditional Chinese gate and a black SUV in the background, illustrating a blend of modern and cultural elements.", # noqa: E501 + "A red stop sign stands prominently in the foreground, with a traditional Chinese gate and a black SUV in the background, illustrating a blend of modern and cultural elements.", # noqa: E501 "A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.", # noqa: E501 ] @@ -76,3 +76,7 @@ def test_qwen2vl_lora(qwen2vl_lora_files): output1 = do_sample(llm, qwen2vl_lora_files, lora_id=1) for i in range(len(EXPECTED_OUTPUT)): assert EXPECTED_OUTPUT[i].startswith(output1[i]) + + output2 = do_sample(llm, qwen2vl_lora_files, lora_id=2) + for i in range(len(EXPECTED_OUTPUT)): + assert EXPECTED_OUTPUT[i].startswith(output2[i]) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index f50db8e3b8e10..5c0e4e5cbc636 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -231,7 +231,8 @@ class LoRAModel(AdapterModel): with safetensors.safe_open(lora_tensor_path, framework="pt") as f: # type: ignore for lora_module in f.keys(): # noqa - module_name, _, _ = parse_fine_tuned_lora_name(lora_module) + module_name, _, _ = parse_fine_tuned_lora_name( + lora_module, weights_mapper) part_name = module_name.split(".")[-1] if part_name not in expected_lora_modules: unexpected_modules.append(module_name) diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 3a84a6ae1c02a..d72b7638d84af 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -1,4 +1,3 @@ -import copy import os import re from typing import List, Optional, Set, Tuple, Type, Union @@ -32,7 +31,6 @@ from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.models.utils import WeightsMapper -from vllm.utils import print_warning_once logger = init_logger(__name__) @@ -112,36 +110,28 @@ def parse_fine_tuned_lora_name( is_bias whether the tensor is lora bias. """ - w_mapper = None - if weights_mapper: - w_mapper = copy.deepcopy(weights_mapper) - # TODO: Currently only supports mapping for prefix, mapping for - # substr and subfix will be supported in the future. - for attr, mapping in [ - ("orig_to_new_substr", w_mapper.orig_to_new_substr), - ("orig_to_new_suffix", w_mapper.orig_to_new_suffix), - ]: - if mapping: - print_warning_once( - f"vLLM currently does not support mapping of LoRA weights " - f"for {mapping}.") - setattr(w_mapper, attr, {}) + # LoRA weight qualified name always starts with `base_model.model.`, + # so we remove the prefix `base_model.model.` to make the following + # mapping correctly. + if "base_model.model." in name: + name = name.replace("base_model.model.", "") + name = weights_mapper._map_name(name) if weights_mapper else name + # recover the prefix `base_model.model.` + name = "base_model.model." + name - mapper = (lambda name: w_mapper._map_name(name) - if w_mapper is not None else name) parts = name.split(".") if parts[-1] == "weight" and (parts[-2] == "lora_A" or parts[-2] == "lora_B"): new_name = ".".join(parts[2:-2]) - return mapper(new_name), parts[-2] == "lora_A", False + return new_name, parts[-2] == "lora_A", False if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B": new_name = ".".join(parts[2:-1]) - return mapper(new_name), parts[-1] == "lora_embedding_A", False + return new_name, parts[-1] == "lora_embedding_A", False if parts[-1] == "bias": new_name = ".".join(parts[2:-2]) - return mapper(new_name), False, True + return new_name, False, True raise ValueError(f"{name} is unsupported LoRA weight") diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index ef8cc5886103e..10976fac23028 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -91,6 +91,8 @@ class WorkerLoRAManager(AbstractWorkerManager): packed_modules_mapping[module]) else: expected_lora_modules.append(module) + + expected_lora_modules = list(set(expected_lora_modules)) lora_path = get_adapter_absolute_path(lora_request.lora_path) # For some models like Qwen2VL, we need to use hf_to_vllm_mapper From 7492a362077ace26b4f0374a39ba5b0846962b87 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Thu, 26 Dec 2024 01:44:32 -0800 Subject: [PATCH 4/8] [Doc] Add `QVQ` and `QwQ` to the list of supported models (#11509) Signed-off-by: Roger Wang Co-authored-by: Cyrus Leung --- docs/source/models/supported_models.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 099e6c8f02815..85fba83147708 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -319,7 +319,7 @@ See [this page](#generative-models) for more information on how to use generativ - ✅︎ * - :code:`Qwen2ForCausalLM` - Qwen2 - - :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc. + - :code:`Qwen/QwQ-32B-Preview`, :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc. - ✅︎ - ✅︎ * - :code:`Qwen2MoeForCausalLM` @@ -710,7 +710,7 @@ See [this page](#generative-models) for more information on how to use generativ * - :code:`Qwen2VLForConditionalGeneration` - Qwen2-VL - T + I\ :sup:`E+` + V\ :sup:`E+` - - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. + - :code:`Qwen/QVQ-72B-Preview`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. - ✅︎ - ✅︎ - From dcb1a944d4cf95b4a7b3522ddf970e6d3a28b8b5 Mon Sep 17 00:00:00 2001 From: sroy745 <142070531+sroy745@users.noreply.github.com> Date: Thu, 26 Dec 2024 02:02:58 -0800 Subject: [PATCH 5/8] [V1] Adding min tokens/repetition/presence/frequence penalties to V1 sampler (#10681) Signed-off-by: Sourashis Roy Signed-off-by: Woosuk Kwon Co-authored-by: Woosuk Kwon --- tests/v1/engine/test_engine_core.py | 38 +++ tests/v1/sample/__init__.py | 0 tests/v1/sample/test_sampler.py | 331 ++++++++++++++++++++++++ tests/v1/worker/__init__.py | 0 tests/v1/worker/test_gpu_input_batch.py | 224 ++++++++++++++++ vllm/model_executor/layers/sampler.py | 51 +--- vllm/model_executor/layers/utils.py | 57 ++++ vllm/v1/sample/metadata.py | 12 +- vllm/v1/sample/sampler.py | 65 ++++- vllm/v1/worker/gpu_input_batch.py | 142 ++++++++++ vllm/v1/worker/gpu_model_runner.py | 8 +- 11 files changed, 879 insertions(+), 49 deletions(-) create mode 100644 tests/v1/sample/__init__.py create mode 100644 tests/v1/sample/test_sampler.py create mode 100644 tests/v1/worker/__init__.py create mode 100644 tests/v1/worker/test_gpu_input_batch.py create mode 100644 vllm/model_executor/layers/utils.py diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index a61ec63a365b5..c529cd21f384b 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -139,3 +139,41 @@ def test_engine_core(monkeypatch): engine_core.abort_requests([req2.request_id, req0.request_id]) assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.running) == 0 + + +def test_engine_core_advanced_sampling(monkeypatch): + """ + A basic end-to-end test to verify that the engine functions correctly + when additional sampling parameters, such as min_tokens and + presence_penalty, are set. + """ + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + """Setup the EngineCore.""" + engine_args = EngineArgs(model=MODEL_NAME) + vllm_config = engine_args.create_engine_config( + usage_context=UsageContext.UNKNOWN_CONTEXT) + executor_class = AsyncLLM._get_executor_cls(vllm_config) + + engine_core = EngineCore(vllm_config=vllm_config, + executor_class=executor_class, + usage_context=UsageContext.UNKNOWN_CONTEXT) + """Test basic request lifecycle.""" + # First request. + request: EngineCoreRequest = make_request() + request.sampling_params = SamplingParams( + min_tokens=4, + presence_penalty=1.0, + frequency_penalty=1.0, + repetition_penalty=0.1, + stop_token_ids=[1001, 1002], + ) + engine_core.add_request(request) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 0 + # Loop through until they are all done. + while len(engine_core.step()) > 0: + pass + + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 diff --git a/tests/v1/sample/__init__.py b/tests/v1/sample/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py new file mode 100644 index 0000000000000..d8d055805cbea --- /dev/null +++ b/tests/v1/sample/test_sampler.py @@ -0,0 +1,331 @@ +from typing import List, Set, Tuple + +import numpy as np +import pytest +import torch + +from vllm.utils import make_tensor_with_pad +from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.sample.sampler import Sampler + +VOCAB_SIZE = 1024 +NUM_OUTPUT_TOKENS = 20 +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] +MAX_NUM_PROMPT_TOKENS = 64 + + +def _create_fake_logits(batch_size: int, vocab_size: int) -> torch.Tensor: + fake_logits = torch.full((batch_size, vocab_size), 1e-2, dtype=torch.float) + return fake_logits + + +def _create_penalty_tensor(batch_size: int, penalty_value: float, + device: torch.device) -> torch.Tensor: + return torch.full((batch_size, ), + fill_value=penalty_value, + dtype=torch.float, + device=device) + + +def _create_prompt_tokens_tensor( + prompt_token_ids: List[List[int]], + vocab_size: int, + device: torch.device, +) -> torch.Tensor: + return make_tensor_with_pad( + prompt_token_ids, + pad=vocab_size, + device=device, + dtype=torch.int64, + pin_memory=False, + ) + + +def _create_default_sampling_metadata( + num_output_tokens: int, + batch_size: int, + vocab_size: int, + device: torch.device, +) -> SamplingMetadata: + output_token_ids: List[List[int]] = [] + prompt_token_ids: List[List[int]] = [] + for _ in range(batch_size): + output_token_ids.append( + np.random.randint(0, vocab_size, size=num_output_tokens).tolist()) + prompt_token_ids.append( + np.random.randint(0, + vocab_size, + size=np.random.randint( + 1, MAX_NUM_PROMPT_TOKENS)).tolist()) + fake_sampling_metadata = SamplingMetadata( + temperature=torch.full((batch_size, ), 0.0), + all_greedy=True, + all_random=False, + top_p=torch.empty(batch_size, ), + top_k=torch.empty(batch_size, ), + no_top_p=True, + no_top_k=True, + generators={}, + max_num_logprobs=VOCAB_SIZE, + prompt_token_ids=_create_prompt_tokens_tensor(prompt_token_ids, + vocab_size, device), + output_token_ids=output_token_ids, + frequency_penalties=_create_penalty_tensor(batch_size, 0.0, device), + presence_penalties=_create_penalty_tensor(batch_size, 0.0, device), + repetition_penalties=_create_penalty_tensor(batch_size, 1.0, device), + no_penalties=True, + min_tokens=[], + stop_token_ids=[], + ) + return fake_sampling_metadata + + +def _generate_min_token_penalties_and_stop_tokens( + num_output_tokens: int, batch_size: int, vocab_size: int, + batch_indices_for_min_token_penalty: List[int] +) -> Tuple[List[int], List[Set[int]]]: + """ + Generates and returns a list of minimum token penalties (`min_tokens`) + and a corresponding list of stop token IDs (`stop_token_ids`) for each + batch. + + If a batch index is included in `batch_indices_for_min_token_penalty`, + a higher `min_tokens` value is assigned (within a randomized range), + and a random set of stop token IDs is created. Otherwise, a lower + `min_tokens` value is assigned, and the stop token IDs set is empty. + """ + stop_token_ids: List[Set[int]] = [] + min_tokens: List[int] = [] + for index in range(batch_size): + if index in batch_indices_for_min_token_penalty: + min_tokens.append( + np.random.randint(num_output_tokens + 1, + 2 * num_output_tokens)) + stop_token_ids.append( + set( + np.random.randint(0, vocab_size - 1) + for _ in range(np.random.randint(0, vocab_size)))) + + else: + min_tokens.append(np.random.randint(0, num_output_tokens)) + stop_token_ids.append(set()) + return (min_tokens, stop_token_ids) + + +def _create_weighted_output_token_list( + batch_size: int, + vocab_size: int) -> Tuple[List[List[int]], List[List[int]]]: + """ + Creates an output token list where each token occurs a distinct + number of times. + + For each batch, a random subset of token IDs is selected from the + vocabulary. The selected tokens are then added to the output token + list, each with a different frequency. + + Returns: + Tuple[List[List[int]], List[List[int]]]: + - The first element is the output token list, where each sublist + corresponds to a batch and contains tokens with weighted + frequencies. + - The second element is a list of distinct token IDs for each + batch, ordered by their frequency in the corresponding output + list. + """ + output_token_ids: List[List[int]] = [] + sorted_token_ids_in_output: List[List[int]] = [] + for _ in range(batch_size): + distinct_token_ids = np.random.choice(vocab_size, + size=np.random.randint(1, 10), + replace=False).tolist() + sorted_token_ids_in_output.append(distinct_token_ids) + output_token_ids_for_batch = [] + for index, token_id in enumerate(distinct_token_ids): + output_token_ids_for_batch.extend( + [token_id for _ in range(index + 1)]) + output_token_ids.append(output_token_ids_for_batch) + return (output_token_ids, sorted_token_ids_in_output) + + +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2, 32]) +def test_sampler_min_tokens_penalty(device: str, batch_size: int): + """ + Tests that if the number of output tokens is less than + SamplingParams.min_tokens then we will set the logits for + the stop token ids to -inf. + """ + torch.set_default_device(device) + fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + batch_indices_for_min_token_penalty = np.random.randint( + 0, batch_size - 1, size=np.random.randint(0, batch_size)).tolist() + min_tokens, stop_token_ids = _generate_min_token_penalties_and_stop_tokens( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, + batch_indices_for_min_token_penalty) + sampling_metadata.min_tokens = min_tokens + sampling_metadata.stop_token_ids = stop_token_ids + sampler = Sampler() + sampler_output = sampler(fake_logits, sampling_metadata) + for batch_idx in range(batch_size): + for vocab in range(VOCAB_SIZE): + # Verify that the logprobs for stop token ids is set + # to -inf. + logprob_index = torch.where( + sampler_output.logprob_token_ids[batch_idx] == + vocab)[0].item() + if vocab in stop_token_ids[batch_idx]: + assert sampler_output.logprobs[batch_idx][ + logprob_index] == -float("inf") + else: + assert sampler_output.logprobs[batch_idx][ + logprob_index] != -float("inf") + + +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2, 32]) +@pytest.mark.parametrize("presence_penalty", [-2.0, 2.0]) +def test_sampler_presence_penalty(device: str, batch_size: int, + presence_penalty: float): + """ + Test to verify that if presence penalty is enabled then tokens + are penalized as per their presence in the existing output. + """ + torch.set_default_device(device) + # Create fake logits where each token is assigned the same + # logit value. + fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + output_token_ids = sampling_metadata.output_token_ids + sampling_metadata.presence_penalties = _create_penalty_tensor( + batch_size, presence_penalty, torch.device(device)) + sampling_metadata.no_penalties = False + sampler = Sampler() + sampler_output = sampler(fake_logits, sampling_metadata) + for batch_idx in range(batch_size): + # The logprobs in the SamplerOutput are arranged in descending order. + # Since all tokens initially have the same logprobs, the non-penalized + # tokens will appear at the beginning, while the penalized tokens + # will appear at the end of the list. + penalized_token_id = sampler_output.logprob_token_ids[batch_idx][ + VOCAB_SIZE - 1] + penalized_log_prod = sampler_output.logprobs[batch_idx][VOCAB_SIZE - 1] + non_penalized_token_id = sampler_output.logprob_token_ids[batch_idx][0] + non_penalized_log_prod = sampler_output.logprobs[batch_idx][0] + assert non_penalized_log_prod > penalized_log_prod + if presence_penalty > 0: + # If `presence_penalty` is set to a value greater than 0, it + # indicates a preference for new tokens over those already + # present in the output. + # Verify that the penalized token ID exists in the output, while the + # non-penalized token ID does not. + assert penalized_token_id in output_token_ids[batch_idx] + assert non_penalized_token_id not in output_token_ids[batch_idx] + elif presence_penalty < 0: + # If `presence_penalty` is set to a value less than 0, it indicates + # a preference for existing tokens over new ones. Verify that the + # non-penalized token ID exists in the output, while the penalized + # token ID does not. + assert non_penalized_token_id in output_token_ids[batch_idx] + assert penalized_token_id not in output_token_ids[batch_idx] + + +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2, 32]) +@pytest.mark.parametrize("frequency_penalty", [-2.0, 2.0]) +def test_sampler_frequency_penalty(device: str, batch_size: int, + frequency_penalty: float): + """ + Test to verify that if frequency penalty is enabled then tokens are + penalized as per their frequency of occurrence. + """ + torch.set_default_device(device) + # Create fake logits where each token is assigned the same + # logit value. + fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + sampling_metadata.frequency_penalties = _create_penalty_tensor( + batch_size, frequency_penalty, torch.device(device)) + output_token_ids, sorted_token_ids_in_output = \ + _create_weighted_output_token_list(batch_size, VOCAB_SIZE) + sampling_metadata.output_token_ids = output_token_ids + sampling_metadata.no_penalties = False + sampler = Sampler() + sampler_output = sampler(fake_logits, sampling_metadata) + for batch_idx in range(batch_size): + logprobs_token_ids = sampler_output.logprob_token_ids[batch_idx] + non_penalized_token_id = logprobs_token_ids[0] + penalized_token_id = logprobs_token_ids[VOCAB_SIZE - 1] + distinct_sorted_token_ids_in_output = \ + sorted_token_ids_in_output[batch_idx] + most_frequent_token_id = distinct_sorted_token_ids_in_output[ + len(distinct_sorted_token_ids_in_output) - 1] + if frequency_penalty > 0: + # If `frequency_penalty` is set to > 0, it indicates + # a preference for new tokens over existing ones. Verify that the + # non-penalized token ID is not present in the output, while the + # most penalized token is the one that occurs most frequently in + # the output. + assert non_penalized_token_id \ + not in distinct_sorted_token_ids_in_output + assert penalized_token_id == most_frequent_token_id + elif frequency_penalty < 0: + # If `frequency_penalty` is set to < 0, it indicates + # a preference for existing tokens over new ones. Verify that the + # non-penalized token ID is the one that occurs most frequently + # in the output, while the penalized token ID is one that has not + # yet appeared. + assert non_penalized_token_id == most_frequent_token_id + assert penalized_token_id \ + not in distinct_sorted_token_ids_in_output + + +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2, 32]) +@pytest.mark.parametrize("repetition_penalty", [0.1, 1.9]) +def test_sampler_repetition_penalty(device: str, batch_size: int, + repetition_penalty: float): + """ + Test to verify that when the repetition penalty is enabled, tokens + are penalized based on their presence in the prompt or the existing + output. + """ + torch.set_default_device(device) + # Create fake logits where each token is assigned the same + # logit value. + fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + sampling_metadata.repetition_penalties = _create_penalty_tensor( + batch_size, repetition_penalty, torch.device(device)) + sampling_metadata.no_penalties = False + sampler = Sampler() + sampler_output = sampler(fake_logits, sampling_metadata) + for batch_idx in range(batch_size): + logprobs_token_ids = sampler_output.logprob_token_ids[batch_idx] + non_penalized_token_id = logprobs_token_ids[0] + penalized_token_id = logprobs_token_ids[VOCAB_SIZE - 1] + prompt_tokens = sampling_metadata.prompt_token_ids[ + batch_idx][:].tolist() + output_tokens = sampling_metadata.output_token_ids[batch_idx] + if repetition_penalty > 1.0: + # If `repetition_penalty` > 1.0, verify that the non-penalized + # token ID has not been seen before, while the penalized token ID + # exists either in the prompt or the output. + assert (non_penalized_token_id not in prompt_tokens and \ + non_penalized_token_id not in output_tokens) + assert (penalized_token_id in prompt_tokens or \ + penalized_token_id in output_tokens) + elif repetition_penalty < 1.0: + # If `repetition_penalty` < 1.0, verify that the penalized + # token ID has not been seen before, while the non-penalized + # token ID exists either in the prompt or the output. + assert (penalized_token_id not in prompt_tokens and \ + penalized_token_id not in output_tokens) + assert (non_penalized_token_id in prompt_tokens or \ + non_penalized_token_id in output_tokens) diff --git a/tests/v1/worker/__init__.py b/tests/v1/worker/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py new file mode 100644 index 0000000000000..694ce81ff6e22 --- /dev/null +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -0,0 +1,224 @@ +from typing import Dict, List, Set, Tuple + +import numpy as np +import pytest +import torch + +from vllm.sampling_params import SamplingParams +from vllm.utils import is_pin_memory_available, make_tensor_with_pad +from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch + +VOCAB_SIZE = 1024 +NUM_OUTPUT_TOKENS = 20 +MAX_PROMPT_SIZE = 100 +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] +MAX_NUM_PROMPT_TOKENS = 64 + + +def _remove_requests( + input_batch: InputBatch, batch_size: int, + reqs: List[CachedRequestState]) -> Tuple[Set[str], List[int]]: + """ + Remove some requests randomly from the batch and returns a Tuple + of 1) set of request removed 2) indices of the requests removed + ordered in descending order + """ + + num_reqs_to_remove = np.random.randint(0, batch_size) + req_indices_to_remove: Set[int] = set() + for _ in range(num_reqs_to_remove): + req_index_to_remove = np.random.randint(0, batch_size) + req_indices_to_remove.add(req_index_to_remove) + + req_indices_to_remove_list = list(req_indices_to_remove) + req_indices_to_remove_list.sort(reverse=True) + req_ids_to_remove: Set[str] = set() + for index in req_indices_to_remove: + input_batch.remove_request(reqs[index].req_id) + req_ids_to_remove.add(reqs[index].req_id) + return (req_ids_to_remove, req_indices_to_remove_list) + + +def _construct_expected_sampling_metadata( + reqs: List[CachedRequestState], req_ids_retained: Set[int], + req_id_index_in_input_batch: Dict[str, int], + device: torch.device) -> SamplingMetadata: + """ + Constructs and returns the expected SamplingMetadata for this + batch. + """ + num_reqs = len(req_ids_retained) + output_token_ids: List[List[int]] = [list() for _ in range(num_reqs)] + prompt_token_ids: List[List[int]] = [list() for _ in range(num_reqs)] + presence_penalties = [0.0 for _ in range(num_reqs)] + frequency_penalties = [0.0 for _ in range(num_reqs)] + repetition_penalties = [1.0 for _ in range(num_reqs)] + top_k = [0 for _ in range(num_reqs)] + top_p = [0.0 for _ in range(num_reqs)] + temperature = [0.0 for _ in range(num_reqs)] + stop_token_ids: List[Set[int]] = [set() for _ in range(num_reqs)] + min_tokens = [0 for _ in range(num_reqs)] + for req in reqs: + if req.req_id not in req_ids_retained: + continue + index_in_input_batch = req_id_index_in_input_batch[req.req_id] + output_token_ids[index_in_input_batch] = req.output_token_ids + prompt_token_ids[index_in_input_batch] = req.prompt_token_ids + presence_penalties[ + index_in_input_batch] = req.sampling_params.presence_penalty + frequency_penalties[ + index_in_input_batch] = req.sampling_params.frequency_penalty + repetition_penalties[ + index_in_input_batch] = req.sampling_params.repetition_penalty + top_k[index_in_input_batch] = req.sampling_params.top_k + top_p[index_in_input_batch] = req.sampling_params.top_p + temperature[index_in_input_batch] = req.sampling_params.temperature + stop_token_ids[ + index_in_input_batch] = req.sampling_params.all_stop_token_ids + min_tokens[index_in_input_batch] = req.sampling_params.min_tokens + + + return SamplingMetadata( + temperature=torch.tensor(temperature, dtype=torch.float, device=device), + all_greedy=False, + all_random=True, + top_p=torch.tensor(top_p, dtype=torch.float, device=device), + top_k=torch.tensor(top_k, dtype=torch.int, device=device), + no_top_p=all(x == 1.0 for x in top_p), + no_top_k=all(x == 0 for x in top_k), + generators={}, + max_num_logprobs=0, + prompt_token_ids= make_tensor_with_pad( + prompt_token_ids, + pad=VOCAB_SIZE, + device=torch.device(device), + dtype=torch.int64, + ), + frequency_penalties=torch.tensor( + frequency_penalties, dtype=torch.float, + device=device), + presence_penalties=torch.tensor( + presence_penalties, dtype=torch.float, + device=device), + repetition_penalties=torch.tensor( + repetition_penalties, dtype=torch.float, + device=device), + output_token_ids=output_token_ids, + min_tokens=min_tokens, + stop_token_ids=stop_token_ids, + no_penalties=(all(x ==0 for x in presence_penalties) and \ + all(x ==0 for x in frequency_penalties) and \ + all(x ==1 for x in repetition_penalties)) + ) + + +def _create_sampling_params(): + return SamplingParams(top_k=np.random.randint(1, 10), + top_p=np.random.uniform(0.0, 1.0), + presence_penalty=np.random.uniform(-2.0, 2.0), + repetition_penalty=np.random.uniform(0.0, 2.0), + frequency_penalty=np.random.uniform(-2.0, 2.0), + min_tokens=np.random.randint(1, 10), + stop_token_ids=[ + np.random.randint(0, VOCAB_SIZE) + for _ in range(np.random.randint(10)) + ]) + + +def _construct_cached_request_state(req_id_suffix: int): + prompt_token_ids = [ + np.random.randint(0, VOCAB_SIZE) + for _ in range(np.random.randint(0, MAX_PROMPT_SIZE)) + ] + output_token_ids = [ + np.random.randint(0, VOCAB_SIZE) + for _ in range(np.random.randint(0, NUM_OUTPUT_TOKENS)) + ] + return CachedRequestState(req_id=f"req_id_{req_id_suffix}", + prompt_token_ids=prompt_token_ids, + prompt=None, + sampling_params=_create_sampling_params(), + mm_inputs=[], + mm_positions=[], + block_ids=[], + generator=None, + num_computed_tokens=len(output_token_ids), + output_token_ids=output_token_ids) + + +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2, 32, 64]) +def test_sampling_metadata_in_input_batch(device: str, batch_size: int): + """ + Tests the logic for managing sampling metadata in the InputBatch. + + This test involves adding a set of requests to the InputBatch, + followed by removing a subset of them. Afterward, the batch is compacted, + and the `make_sampling_metadata` method is invoked on the batch. The + output of `make_sampling_metadata` is then compared against the expected + results to ensure correctness. + """ + input_batch: InputBatch = InputBatch(max_num_reqs=batch_size, + max_model_len=1024, + max_num_blocks_per_req=10, + device=torch.device(device), + pin_memory=is_pin_memory_available(), + vocab_size=1024) + reqs: List[CachedRequestState] = [] + req_id_reqs = {} + req_id_output_token_ids = {} + # Add requests + for req_index in range(batch_size): + req: CachedRequestState = _construct_cached_request_state(req_index) + input_batch.add_request(req, req_index) + reqs.append(req) + req_id_reqs[req.req_id] = req + req_id_output_token_ids[req.req_id] = req.output_token_ids + + # Remove some requests + req_ids_to_remove, req_indices_to_remove = _remove_requests( + input_batch, batch_size, reqs) + req_ids_retained = set(req_id_reqs.keys()) - req_ids_to_remove + + # Compact the input batch + input_batch.condense(req_indices_to_remove) + + # Generate the sampling metadata + sampling_metadata = input_batch.make_sampling_metadata( + req_id_output_token_ids, skip_copy=False) + + # Create expected output. + expected_sampling_metadata = _construct_expected_sampling_metadata( + reqs, + req_ids_retained, + input_batch.req_id_to_index, + device=torch.device(device)) + + # Assert the actual and expected output. + assert torch.allclose(expected_sampling_metadata.temperature, + sampling_metadata.temperature) + assert torch.allclose(expected_sampling_metadata.top_p, + sampling_metadata.top_p) + assert torch.allclose(expected_sampling_metadata.top_k, + sampling_metadata.top_k) + assert torch.allclose(expected_sampling_metadata.frequency_penalties, + sampling_metadata.frequency_penalties) + assert torch.allclose(expected_sampling_metadata.presence_penalties, + sampling_metadata.presence_penalties) + assert torch.allclose(expected_sampling_metadata.repetition_penalties, + sampling_metadata.repetition_penalties) + assert torch.allclose(expected_sampling_metadata.prompt_token_ids, + sampling_metadata.prompt_token_ids) + assert (expected_sampling_metadata.output_token_ids == + sampling_metadata.output_token_ids) + assert ( + expected_sampling_metadata.min_tokens == sampling_metadata.min_tokens) + assert (expected_sampling_metadata.stop_token_ids == + sampling_metadata.stop_token_ids) + assert (expected_sampling_metadata.no_penalties == + sampling_metadata.no_penalties) + assert (expected_sampling_metadata.no_top_p == sampling_metadata.no_top_p) + assert (expected_sampling_metadata.no_top_k == sampling_metadata.no_top_k) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index c10efefea5471..c2d12c466ba45 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -11,6 +11,7 @@ import torch import torch.nn as nn import vllm.envs as envs +from vllm.model_executor.layers.utils import apply_penalties from vllm.model_executor.sampling_metadata import (SamplingMetadata, SamplingTensors, SequenceGroupToSample) @@ -258,11 +259,11 @@ class Sampler(nn.Module): # Apply presence and frequency penalties. if do_penalties: - logits = _apply_penalties(logits, sampling_tensors.prompt_tokens, - sampling_tensors.output_tokens, - sampling_tensors.presence_penalties, - sampling_tensors.frequency_penalties, - sampling_tensors.repetition_penalties) + logits = apply_penalties(logits, sampling_tensors.prompt_tokens, + sampling_tensors.output_tokens, + sampling_tensors.presence_penalties, + sampling_tensors.frequency_penalties, + sampling_tensors.repetition_penalties) # Use float32 to apply temperature scaling. # Use in-place division to avoid creating a new tensor. @@ -336,23 +337,6 @@ class Sampler(nn.Module): return self.should_modify_greedy_probs_inplace -def _get_bin_counts_and_mask( - tokens: torch.Tensor, - vocab_size: int, - num_seqs: int, -) -> Tuple[torch.Tensor, torch.Tensor]: - # Compute the bin counts for the tokens. - # vocab_size + 1 for padding. - bin_counts = torch.zeros((num_seqs, vocab_size + 1), - dtype=torch.long, - device=tokens.device) - bin_counts.scatter_add_(1, tokens, torch.ones_like(tokens)) - bin_counts = bin_counts[:, :vocab_size] - mask = bin_counts > 0 - - return bin_counts, mask - - def _apply_min_tokens_penalty( logits: torch.Tensor, sampling_metadata: SamplingMetadata, @@ -400,29 +384,6 @@ def _apply_min_tokens_penalty( return logits -def _apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor, - output_tokens_tensor: torch.Tensor, - presence_penalties: torch.Tensor, - frequency_penalties: torch.Tensor, - repetition_penalties: torch.Tensor) -> torch.Tensor: - num_seqs, vocab_size = logits.shape - _, prompt_mask = _get_bin_counts_and_mask(prompt_tokens_tensor, vocab_size, - num_seqs) - output_bin_counts, output_mask = _get_bin_counts_and_mask( - output_tokens_tensor, vocab_size, num_seqs) - - repetition_penalties = repetition_penalties[:, None].repeat(1, vocab_size) - repetition_penalties[~(prompt_mask | output_mask)] = 1.0 - logits = torch.where(logits > 0, logits / repetition_penalties, - logits * repetition_penalties) - - # We follow the definition in OpenAI API. - # Refer to https://platform.openai.com/docs/api-reference/parameter-details - logits -= frequency_penalties.unsqueeze_(dim=1) * output_bin_counts - logits -= presence_penalties.unsqueeze_(dim=1) * output_mask - return logits - - def _apply_top_k_top_p( logits: torch.Tensor, p: torch.Tensor, diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py new file mode 100644 index 0000000000000..f6f34cd49d953 --- /dev/null +++ b/vllm/model_executor/layers/utils.py @@ -0,0 +1,57 @@ +"""Utility methods for model layers.""" +from typing import Tuple + +import torch + + +def get_token_bin_counts_and_mask( + tokens: torch.Tensor, + vocab_size: int, + num_seqs: int, +) -> Tuple[torch.Tensor, torch.Tensor]: + # Compute the bin counts for the tokens. + # vocab_size + 1 for padding. + bin_counts = torch.zeros((num_seqs, vocab_size + 1), + dtype=torch.long, + device=tokens.device) + bin_counts.scatter_add_(1, tokens, torch.ones_like(tokens)) + bin_counts = bin_counts[:, :vocab_size] + mask = bin_counts > 0 + + return bin_counts, mask + + +def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor, + output_tokens_tensor: torch.Tensor, + presence_penalties: torch.Tensor, + frequency_penalties: torch.Tensor, + repetition_penalties: torch.Tensor) -> torch.Tensor: + """ + Applies penalties in place to the logits tensor + logits : The input logits tensor of shape [num_seqs, vocab_size] + prompt_tokens_tensor: A tensor containing the prompt tokens. The prompts + are padded to the maximum prompt length within the batch using + `vocab_size` as the padding value. The value `vocab_size` is used + for padding because it does not correspond to any valid token ID + in the vocabulary. + output_tokens_tensor: The output tokens tensor. + presence_penalties: The presence penalties of shape (num_seqs, ) + frequency_penalties: The frequency penalties of shape (num_seqs, ) + repetition_penalties: The repetition penalties of shape (num_seqs, ) + """ + num_seqs, vocab_size = logits.shape + _, prompt_mask = get_token_bin_counts_and_mask(prompt_tokens_tensor, + vocab_size, num_seqs) + output_bin_counts, output_mask = get_token_bin_counts_and_mask( + output_tokens_tensor, vocab_size, num_seqs) + repetition_penalties = repetition_penalties.unsqueeze_(dim=1).repeat( + 1, vocab_size) + logits[logits > 0] /= torch.where(prompt_mask | output_mask, + repetition_penalties, 1.0)[logits > 0] + logits[logits <= 0] *= torch.where(prompt_mask | output_mask, + repetition_penalties, 1.0)[logits <= 0] + # We follow the definition in OpenAI API. + # Refer to https://platform.openai.com/docs/api-reference/parameter-details + logits -= frequency_penalties.unsqueeze_(dim=1) * output_bin_counts + logits -= presence_penalties.unsqueeze_(dim=1) * output_mask + return logits diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 9ef36f2e6b212..d60f7eb5d76f9 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Dict +from typing import Dict, List, Optional, Set import torch @@ -19,3 +19,13 @@ class SamplingMetadata: generators: Dict[int, torch.Generator] max_num_logprobs: int + + no_penalties: bool + prompt_token_ids: Optional[torch.Tensor] + frequency_penalties: torch.Tensor + presence_penalties: torch.Tensor + repetition_penalties: torch.Tensor + + output_token_ids: List[List[int]] + min_tokens: List[int] + stop_token_ids: List[Set[int]] diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index d1a755be01ff7..82470fb2610f8 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -1,9 +1,11 @@ """A layer that samples the next tokens from the model's outputs.""" -from typing import Dict +from typing import Dict, List, Set, Tuple import torch import torch.nn as nn +from vllm.model_executor.layers.utils import apply_penalties +from vllm.utils import is_pin_memory_available, make_tensor_with_pad from vllm.v1.outputs import SamplerOutput from vllm.v1.sample.metadata import SamplingMetadata @@ -17,9 +19,18 @@ class Sampler(nn.Module): logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> SamplerOutput: + _apply_min_token_penalties(logits, sampling_metadata.output_token_ids, + sampling_metadata.stop_token_ids, + sampling_metadata.min_tokens) + if not sampling_metadata.no_penalties: + assert sampling_metadata.prompt_token_ids is not None + _apply_penalties(logits, sampling_metadata.prompt_token_ids, + sampling_metadata.presence_penalties, + sampling_metadata.frequency_penalties, + sampling_metadata.repetition_penalties, + sampling_metadata.output_token_ids) logits = self.apply_temperature(logits, sampling_metadata.temperature) logits = self.apply_top_k_top_p(logits, sampling_metadata) - probs = self.get_probs(logits) sampled = self.sample(probs, sampling_metadata) # Use int32 to reduce the tensor size. @@ -157,3 +168,53 @@ def _apply_top_k_top_p( # Re-sort the probabilities. logits = logits_sort.scatter(dim=-1, index=logits_idx, src=logits_sort) return logits + + +def _apply_min_token_penalties(logits: torch.Tensor, + output_token_ids: List[List[int]], + stop_token_ids: List[Set[int]], + min_tokens: List[int]): + """ + Applies minimum token penalty by setting the logits of the stop tokens + to -inf. + """ + min_tokens_logits_to_penalize: List[Tuple[int, int]] = [] + for index, min_token in enumerate(min_tokens): + if (len(output_token_ids[index]) < min_token): + for stop_token_id in stop_token_ids[index]: + min_tokens_logits_to_penalize.append((index, stop_token_id)) + if min_tokens_logits_to_penalize: + logits[tuple(zip(*min_tokens_logits_to_penalize))] = -float("inf") + + +def _apply_penalties(logits: torch.Tensor, prompt_token_ids: torch.Tensor, + presence_penalties: torch.Tensor, + frequency_penalties: torch.Tensor, + repetition_penalties: torch.Tensor, + output_token_ids: List[List[int]]): + """ + Applies presence, frequency and repetition penalties to the logits. + """ + _, vocab_size = logits.shape + output_tokens_t = _convert_to_tensors(output_token_ids, vocab_size, + logits.device) + return apply_penalties(logits, prompt_token_ids, output_tokens_t, + presence_penalties, frequency_penalties, + repetition_penalties) + + +def _convert_to_tensors(output_token_ids: List[List[int]], vocab_size: int, + device: torch.device) -> torch.Tensor: + """ + Convert the different list data structures to tensors. + """ + output_tokens_tensor = make_tensor_with_pad( + output_token_ids, + # Use the value of vocab_size as a pad since we don't have a + # token_id of this value. + pad=vocab_size, + device="cpu", + dtype=torch.int64, + pin_memory=is_pin_memory_available(), + ) + return output_tokens_tensor.to(device, non_blocking=True) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 5c113c74778df..6c4d300ec6efe 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -43,12 +43,14 @@ class InputBatch: max_num_blocks_per_req: int, device: torch.device, pin_memory: bool, + vocab_size: int, ): self.max_num_reqs = max_num_reqs self.max_model_len = max_model_len self.max_num_blocks_per_req = max_num_blocks_per_req self.device = device self.pin_memory = pin_memory + self.vocab_size = vocab_size self.req_ids: List[Optional[str]] = [None] * max_num_reqs self.req_id_to_index: Dict[str, int] = {} @@ -63,6 +65,7 @@ class InputBatch: ) self.token_ids_cpu = self.token_ids_cpu_tensor.numpy() self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32) + self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32) # Attention-related. self.block_table = torch.zeros( @@ -110,6 +113,50 @@ class InputBatch: self.top_k_cpu = self.top_k_cpu_tensor.numpy() self.top_k_reqs: Set[str] = set() + # Frequency penalty related data structures + self.frequency_penalties = torch.empty((max_num_reqs, ), + dtype=torch.float, + device=device) + self.frequency_penalties_cpu_tensor = torch.empty( + (max_num_reqs, ), + dtype=torch.float, + device="cpu", + pin_memory=pin_memory) + self.frequency_penalties_cpu = \ + self.frequency_penalties_cpu_tensor.numpy() + self.frequency_penalties_reqs: Set[str] = set() + + # Presence penalty related data structures + self.presence_penalties = torch.empty((max_num_reqs, ), + dtype=torch.float, + device=device) + self.presence_penalties_cpu_tensor = torch.empty((max_num_reqs, ), + dtype=torch.float, + device="cpu", + pin_memory=pin_memory) + self.presence_penalties_cpu = \ + self.presence_penalties_cpu_tensor.numpy() + self.presence_penalties_reqs: Set[str] = set() + + # Repetition penalty related data structures + self.repetition_penalties = torch.empty((max_num_reqs, ), + dtype=torch.float, + device=device) + self.repetition_penalties_cpu_tensor = torch.empty( + (max_num_reqs, ), + dtype=torch.float, + device="cpu", + pin_memory=pin_memory) + self.repetition_penalties_cpu = \ + self.repetition_penalties_cpu_tensor.numpy() + self.repetition_penalties_reqs: Set[str] = set() + + self.min_tokens: List[int] = [0] * max_num_reqs + self.stop_token_ids: List[Set[int]] = [ + set() for _ in range(max_num_reqs) + ] + self.prompt_token_ids: Optional[torch.Tensor] = None + # req_index -> generator # NOTE(woosuk): The indices of the requests that do not have their own # generator should not be included in the dictionary. @@ -133,6 +180,7 @@ class InputBatch: # Copy the prompt token ids and output token ids. num_prompt_tokens = len(request.prompt_token_ids) + self.num_prompt_tokens[req_index] = num_prompt_tokens self.token_ids_cpu[ req_index, :num_prompt_tokens] = request.prompt_token_ids start_idx = num_prompt_tokens @@ -157,6 +205,20 @@ class InputBatch: self.top_k_cpu[req_index] = sampling_params.top_k if sampling_params.top_k > 0: self.top_k_reqs.add(req_id) + self.frequency_penalties_cpu[req_index] = \ + sampling_params.frequency_penalty + if sampling_params.frequency_penalty != 0.0: + self.frequency_penalties_reqs.add(req_id) + self.presence_penalties_cpu[req_index] = \ + sampling_params.presence_penalty + if sampling_params.presence_penalty != 0.0: + self.presence_penalties_reqs.add(req_id) + self.repetition_penalties_cpu[req_index] = \ + sampling_params.repetition_penalty + if sampling_params.repetition_penalty != 1.0: + self.repetition_penalties_reqs.add(req_id) + self.min_tokens[req_index] = sampling_params.min_tokens + self.stop_token_ids[req_index] = sampling_params.all_stop_token_ids # NOTE(woosuk): self.generators should not include the requests that # do not have their own generator. @@ -179,6 +241,9 @@ class InputBatch: self.random_reqs.discard(req_id) self.top_p_reqs.discard(req_id) self.top_k_reqs.discard(req_id) + self.frequency_penalties_reqs.discard(req_id) + self.presence_penalties_reqs.discard(req_id) + self.repetition_penalties_reqs.discard(req_id) self.generators.pop(req_index, None) self.num_logprobs.pop(req_id, None) self.prompt_logprob_reqs.discard(req_id) @@ -191,6 +256,9 @@ class InputBatch: self.random_reqs.clear() self.top_p_reqs.clear() self.top_k_reqs.clear() + self.frequency_penalties_reqs.clear() + self.presence_penalties_reqs.clear() + self.repetition_penalties_reqs.clear() self.generators.clear() self.num_logprobs.clear() self.prompt_logprob_reqs.clear() @@ -224,6 +292,8 @@ class InputBatch: # block_table_cpu. self.token_ids_cpu[empty_index] = self.token_ids_cpu[ last_req_index] + self.num_prompt_tokens[empty_index] = \ + self.num_prompt_tokens[last_req_index] self.num_computed_tokens_cpu[ empty_index] = self.num_computed_tokens_cpu[last_req_index] self.block_table_cpu[empty_index] = self.block_table_cpu[ @@ -232,6 +302,15 @@ class InputBatch: last_req_index] self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index] self.top_k_cpu[empty_index] = self.top_k_cpu[last_req_index] + self.frequency_penalties_cpu[empty_index] = \ + self.frequency_penalties_cpu[last_req_index] + self.presence_penalties_cpu[empty_index] = \ + self.presence_penalties_cpu[last_req_index] + self.repetition_penalties_cpu[empty_index] = \ + self.repetition_penalties_cpu[last_req_index] + self.min_tokens[empty_index] = self.min_tokens[last_req_index] + self.stop_token_ids[empty_index] = \ + self.stop_token_ids[last_req_index] generator = self.generators.pop(last_req_index, None) if generator is not None: self.generators[empty_index] = generator @@ -241,6 +320,7 @@ class InputBatch: def make_sampling_metadata( self, + req_id_output_token_ids: Dict[str, List[int]], skip_copy: bool = False, ) -> SamplingMetadata: if not skip_copy: @@ -250,6 +330,37 @@ class InputBatch: self.top_p_cpu_tensor[:self.num_reqs], non_blocking=True) self.top_k[:self.num_reqs].copy_( self.top_k_cpu_tensor[:self.num_reqs], non_blocking=True) + if not self.no_penalties: + # Since syncing these tensors is expensive only copy them + # if necessary i.e. if there are requests which require + # penalties to be applied during sampling. + self.frequency_penalties[:self.num_reqs].copy_( + self.frequency_penalties_cpu_tensor[:self.num_reqs], + non_blocking=True) + self.presence_penalties[:self.num_reqs].copy_( + self.presence_penalties_cpu_tensor[:self.num_reqs], + non_blocking=True) + self.repetition_penalties[:self.num_reqs].copy_( + self.repetition_penalties_cpu_tensor[:self.num_reqs], + non_blocking=True) + # The prompt tokens are used only for applying penalties during + # the sampling process. Hence copy these tensors only when + # there are requests which need penalties to be applied. + self.prompt_token_ids = self._make_prompt_token_ids_tensor() + + output_token_ids: List[List[int]] = [] + + for req_id in self.req_ids[:self.num_reqs]: + assert req_id is not None + # Currently we create a tensor for output_token_ids from scratch + # at each step. However, for the penalties computation what we + # need is stats about the token ids present in the output. This + # stats can be maintained incrementally instead of computing it + # from scratch at each step. + # TODO - Replace this with incremental update to output token + # statistics. + output_token_ids.append(req_id_output_token_ids[req_id]) + return SamplingMetadata( temperature=self.temperature[:self.num_reqs], all_greedy=self.all_greedy, @@ -260,8 +371,33 @@ class InputBatch: no_top_k=self.no_top_k, generators=self.generators, max_num_logprobs=self.max_num_logprobs, + prompt_token_ids=self.prompt_token_ids, + frequency_penalties=self.frequency_penalties[:self.num_reqs], + presence_penalties=self.presence_penalties[:self.num_reqs], + repetition_penalties=self.repetition_penalties[:self.num_reqs], + output_token_ids=output_token_ids, + min_tokens=self.min_tokens[:self.num_reqs], + stop_token_ids=self.stop_token_ids[:self.num_reqs], + no_penalties=self.no_penalties, ) + def _make_prompt_token_ids_tensor(self) -> torch.Tensor: + max_prompt_len = self.num_prompt_tokens[:self.num_reqs].max() + prompt_token_ids_cpu_tensor = torch.empty( + (self.num_reqs, max_prompt_len), + device="cpu", + dtype=torch.int64, + pin_memory=self.pin_memory) + prompt_token_ids = prompt_token_ids_cpu_tensor.numpy() + prompt_token_ids[:] = ( + self.token_ids_cpu[:self.num_reqs, :max_prompt_len]) + # Use the value of vocab_size as a pad since we don't have a + # token_id of this value. + for i in range(self.num_reqs): + prompt_token_ids[i, self.num_prompt_tokens[i]:] = self.vocab_size + return prompt_token_ids_cpu_tensor.to(device=self.device, + non_blocking=True) + @property def num_reqs(self) -> int: return len(self.req_id_to_index) @@ -282,6 +418,12 @@ class InputBatch: def no_top_k(self) -> bool: return len(self.top_k_reqs) == 0 + @property + def no_penalties(self) -> bool: + return (len(self.presence_penalties_reqs) == 0 + and len(self.frequency_penalties_reqs) == 0 + and len(self.repetition_penalties_reqs) == 0) + @property def max_num_logprobs(self) -> int: return max(self.num_logprobs.values()) if self.num_logprobs else 0 diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ace62d8978bea..509771b7e2e5a 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -105,6 +105,7 @@ class GPUModelRunner: max_num_blocks_per_req=self.max_num_blocks_per_req, device=self.device, pin_memory=self.pin_memory, + vocab_size=model_config.get_vocab_size(), ) self.use_cuda_graph = (self.vllm_config.compilation_config.level @@ -383,7 +384,12 @@ class GPUModelRunner: or scheduler_output.scheduled_resumed_reqs): skip_copy = False # Create the sampling metadata. - sampling_metadata = self.input_batch.make_sampling_metadata(skip_copy) + req_id_output_token_ids: Dict[str, List[int]] = \ + {req_id: req.output_token_ids \ + for req_id, req in self.requests.items()} + + sampling_metadata = self.input_batch.make_sampling_metadata( + req_id_output_token_ids, skip_copy) return sampling_metadata def _execute_encoder(self, scheduler_output: "SchedulerOutput"): From f57ee5650dd402c6147980824c6936c96cfa59fe Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 26 Dec 2024 21:12:05 +0800 Subject: [PATCH 6/8] [Model] Modify MolmoForCausalLM MLP (#11510) Signed-off-by: Jee Jee Li --- vllm/model_executor/models/molmo.py | 42 ++++++++++++++++------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 63a25137f8aa9..8938f62d0c494 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -464,24 +464,27 @@ class MolmoAttention(nn.Module): class MolmoMLP(nn.Module): """Molmo's LLM mlp.""" - def __init__( - self, - config: PretrainedConfig, - input_dim: Optional[int] = None, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: + def __init__(self, + config: PretrainedConfig, + input_dim: Optional[int] = None, + quant_config: Optional[QuantizationConfig] = None, + proj_name: str = "gate_up_proj") -> None: super().__init__() self.hidden_size = config.hidden_size self.intermediate_size = config.intermediate_size // 2 - # Feed-forward input projection. - self.gate_up_proj = MergedColumnParallelLinear( - input_dim or self.hidden_size, - [self.intermediate_size] * 2, - bias=False, - quant_config=quant_config, - ) - + # Molmo's LLM proj weights are already merged into the disk, while + # image_projector proj is separate. If the same proj_name were used, it + # would create ambiguity and make it difficult to support BNB and LoRA. + self.proj_name = proj_name + setattr( + self, proj_name, + MergedColumnParallelLinear( + input_dim or self.hidden_size, + [self.intermediate_size] * 2, + bias=False, + quant_config=quant_config, + )) # Activation function. self.act_fn = SiluAndMul() @@ -497,7 +500,7 @@ class MolmoMLP(nn.Module): self, x: torch.Tensor, ) -> torch.Tensor: - gate_up, _ = self.gate_up_proj(x) + gate_up, _ = getattr(self, self.proj_name)(x) x = self.act_fn(gate_up) x, _ = self.down_proj(x) return x @@ -520,7 +523,9 @@ class MolmoDecoderLayer(nn.Module): prefix=f"{prefix}.self_attn") # MLP block. - self.mlp = MolmoMLP(config, quant_config=quant_config) + self.mlp = MolmoMLP(config, + quant_config=quant_config, + proj_name="gate_up_proj") # LayerNorm assert config.layer_norm_type == "rms" @@ -616,6 +621,7 @@ class MolmoVisionBackbone(nn.Module): config, input_dim=vision_config.image_emb_dim, quant_config=quant_config, + proj_name="merged_linear", ) image_dim = vision_config.image_emb_dim * len(self.vit_layers) @@ -714,8 +720,8 @@ class MolmoVisionBackbone(nn.Module): torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), + ("merged_linear", "gate_proj", 0), + ("merged_linear", "up_proj", 1), ] params_dict = dict(self.named_parameters()) loaded_params: Set[str] = set() From eec906d8114cd786315e49ab7f5a3093d1896880 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 26 Dec 2024 21:12:51 +0800 Subject: [PATCH 7/8] [Misc] Add placeholder module (#11501) Signed-off-by: DarkLight1337 --- tests/tensorizer_loader/test_tensorizer.py | 9 ++- vllm/assets/audio.py | 19 +++--- vllm/assets/base.py | 7 +-- vllm/assets/image.py | 3 +- vllm/assets/video.py | 9 +-- vllm/config.py | 10 +-- vllm/model_executor/model_loader/loader.py | 11 +--- .../model_executor/model_loader/tensorizer.py | 25 ++++---- .../model_loader/weight_utils.py | 17 ++--- vllm/multimodal/audio.py | 24 ++----- vllm/multimodal/utils.py | 26 +++++--- vllm/multimodal/video.py | 13 +--- vllm/transformers_utils/s3_utils.py | 7 ++- vllm/utils.py | 63 +++++++++++++++++++ 14 files changed, 143 insertions(+), 100 deletions(-) diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index edd079bc7a389..0b0792b6b845f 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -9,7 +9,6 @@ import openai import pytest import torch from huggingface_hub import snapshot_download -from tensorizer import EncryptionParams from vllm import SamplingParams from vllm.engine.arg_utils import EngineArgs @@ -23,12 +22,18 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig, serialize_vllm_model, tensorize_vllm_model) # yapf: enable -from vllm.utils import import_from_path +from vllm.utils import PlaceholderModule, import_from_path from ..conftest import VllmRunner from ..utils import VLLM_PATH, RemoteOpenAIServer from .conftest import retry_until_skip +try: + from tensorizer import EncryptionParams +except ImportError: + tensorizer = PlaceholderModule("tensorizer") # type: ignore[assignment] + EncryptionParams = tensorizer.placeholder_attr("EncryptionParams") + EXAMPLES_PATH = VLLM_PATH / "examples" prompts = [ diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py index 49bb6aeee90bc..9033644e3264a 100644 --- a/vllm/assets/audio.py +++ b/vllm/assets/audio.py @@ -1,11 +1,17 @@ from dataclasses import dataclass -from typing import Literal, Tuple +from typing import Literal from urllib.parse import urljoin -import librosa -import numpy as np +import numpy.typing as npt -from vllm.assets.base import get_vllm_public_assets, vLLM_S3_BUCKET_URL +from vllm.utils import PlaceholderModule + +from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets + +try: + import librosa +except ImportError: + librosa = PlaceholderModule("librosa") # type: ignore[assignment] ASSET_DIR = "multimodal_asset" @@ -15,8 +21,7 @@ class AudioAsset: name: Literal["winning_call", "mary_had_lamb"] @property - def audio_and_sample_rate(self) -> Tuple[np.ndarray, int]: - + def audio_and_sample_rate(self) -> tuple[npt.NDArray, int]: audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg", s3_prefix=ASSET_DIR) y, sr = librosa.load(audio_path, sr=None) @@ -25,4 +30,4 @@ class AudioAsset: @property def url(self) -> str: - return urljoin(vLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg") + return urljoin(VLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg") diff --git a/vllm/assets/base.py b/vllm/assets/base.py index f97e8c218f65b..249173141106c 100644 --- a/vllm/assets/base.py +++ b/vllm/assets/base.py @@ -4,9 +4,8 @@ from typing import Optional import vllm.envs as envs from vllm.connections import global_http_connection -from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT -vLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com" +VLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com" def get_cache_dir() -> Path: @@ -32,8 +31,8 @@ def get_vllm_public_assets(filename: str, if s3_prefix is not None: filename = s3_prefix + "/" + filename global_http_connection.download_file( - f"{vLLM_S3_BUCKET_URL}/{filename}", + f"{VLLM_S3_BUCKET_URL}/{filename}", asset_path, - timeout=VLLM_IMAGE_FETCH_TIMEOUT) + timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT) return asset_path diff --git a/vllm/assets/image.py b/vllm/assets/image.py index 389ecd5c869bc..cb831cb0b5bb4 100644 --- a/vllm/assets/image.py +++ b/vllm/assets/image.py @@ -4,7 +4,7 @@ from typing import Literal import torch from PIL import Image -from vllm.assets.base import get_vllm_public_assets +from .base import get_vllm_public_assets VLM_IMAGES_DIR = "vision_model_images" @@ -15,7 +15,6 @@ class ImageAsset: @property def pil_image(self) -> Image.Image: - image_path = get_vllm_public_assets(filename=f"{self.name}.jpg", s3_prefix=VLM_IMAGES_DIR) return Image.open(image_path) diff --git a/vllm/assets/video.py b/vllm/assets/video.py index e6779935bad17..eca2ccc54482c 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -2,13 +2,13 @@ from dataclasses import dataclass from functools import lru_cache from typing import List, Literal +import cv2 import numpy as np import numpy.typing as npt from huggingface_hub import hf_hub_download from PIL import Image -from vllm.multimodal.video import (sample_frames_from_video, - try_import_video_packages) +from vllm.multimodal.video import sample_frames_from_video from .base import get_cache_dir @@ -19,7 +19,7 @@ def download_video_asset(filename: str) -> str: Download and open an image from huggingface repo: raushan-testing-hf/videos-test """ - video_directory = get_cache_dir() / "video-eample-data" + video_directory = get_cache_dir() / "video-example-data" video_directory.mkdir(parents=True, exist_ok=True) video_path = video_directory / filename @@ -35,8 +35,6 @@ def download_video_asset(filename: str) -> str: def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray: - cv2, _ = try_import_video_packages() - cap = cv2.VideoCapture(path) if not cap.isOpened(): raise ValueError(f"Could not open video file {path}") @@ -59,7 +57,6 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray: def video_to_pil_images_list(path: str, num_frames: int = -1) -> List[Image.Image]: - cv2, _ = try_import_video_packages() frames = video_to_ndarrays(path, num_frames) return [ Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) diff --git a/vllm/config.py b/vllm/config.py index 17602bda15c69..de8ba029ddc23 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -29,6 +29,7 @@ from vllm.transformers_utils.config import ( get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, try_get_generation_config, uses_mrope) +from vllm.transformers_utils.s3_utils import S3Model from vllm.transformers_utils.utils import is_s3 from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless, get_cpu_memory, print_warning_once, random_uuid, @@ -372,15 +373,6 @@ class ModelConfig: """ if is_s3(model) or is_s3(tokenizer): - try: - from vllm.transformers_utils.s3_utils import S3Model - except ImportError as err: - raise ImportError( - "Please install Run:ai optional dependency " - "to use the S3 capabilities. " - "You can install it with: pip install vllm[runai]" - ) from err - if is_s3(model): self.s3_model = S3Model() self.s3_model.pull_files(model, allow_pattern=["*config.json"]) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 24e554e6060ab..f2d9293b31a83 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -48,6 +48,7 @@ from vllm.model_executor.model_loader.weight_utils import ( runai_safetensors_weights_iterator, safetensors_weights_iterator) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform +from vllm.transformers_utils.s3_utils import glob as s3_glob from vllm.transformers_utils.utils import is_s3 from vllm.utils import is_pin_memory_available @@ -1269,16 +1270,6 @@ class RunaiModelStreamerLoader(BaseModelLoader): If the model is not local, it will be downloaded.""" is_s3_path = is_s3(model_name_or_path) - if is_s3_path: - try: - from vllm.transformers_utils.s3_utils import glob as s3_glob - except ImportError as err: - raise ImportError( - "Please install Run:ai optional dependency " - "to use the S3 capabilities. " - "You can install it with: pip install vllm[runai]" - ) from err - is_local = os.path.isdir(model_name_or_path) safetensors_pattern = "*.safetensors" index_file = SAFE_WEIGHTS_INDEX_NAME diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 87f3fcb5cae00..8b929f299c8d8 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -19,9 +19,7 @@ from vllm.engine.llm_engine import LLMEngine from vllm.logger import init_logger from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.utils import FlexibleArgumentParser - -tensorizer_error_msg = None +from vllm.utils import FlexibleArgumentParser, PlaceholderModule try: from tensorizer import (DecryptionParams, EncryptionParams, @@ -34,8 +32,19 @@ try: open_stream, mode=mode, ) for mode in ("rb", "wb+")) -except ImportError as e: - tensorizer_error_msg = str(e) +except ImportError: + tensorizer = PlaceholderModule("tensorizer") + DecryptionParams = tensorizer.placeholder_attr("DecryptionParams") + EncryptionParams = tensorizer.placeholder_attr("EncryptionParams") + TensorDeserializer = tensorizer.placeholder_attr("TensorDeserializer") + TensorSerializer = tensorizer.placeholder_attr("TensorSerializer") + open_stream = tensorizer.placeholder_attr("stream_io.open_stream") + convert_bytes = tensorizer.placeholder_attr("utils.convert_bytes") + get_mem_usage = tensorizer.placeholder_attr("utils.get_mem_usage") + no_init_or_tensor = tensorizer.placeholder_attr("utils.no_init_or_tensor") + + _read_stream = tensorizer.placeholder_attr("_read_stream") + _write_stream = tensorizer.placeholder_attr("_write_stream") __all__ = [ 'EncryptionParams', 'DecryptionParams', 'TensorDeserializer', @@ -267,12 +276,6 @@ class TensorizerAgent: """ def __init__(self, tensorizer_config: TensorizerConfig, vllm_config): - if tensorizer_error_msg is not None: - raise ImportError( - "Tensorizer is not installed. Please install tensorizer " - "to use this feature with `pip install vllm[tensorizer]`. " - "Error message: {}".format(tensorizer_error_msg)) - self.tensorizer_config = tensorizer_config self.tensorizer_args = ( self.tensorizer_config._construct_tensorizer_args()) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index f2a9e7e2687cb..8aa0c98df70d2 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -25,7 +25,15 @@ from vllm.model_executor.layers.quantization import (QuantizationConfig, get_quantization_config) from vllm.model_executor.layers.quantization.schema import QuantParamSchema from vllm.platforms import current_platform -from vllm.utils import print_warning_once +from vllm.utils import PlaceholderModule, print_warning_once + +try: + from runai_model_streamer import SafetensorsStreamer +except ImportError: + runai_model_streamer = PlaceholderModule( + "runai_model_streamer") # type: ignore[assignment] + SafetensorsStreamer = runai_model_streamer.placeholder_attr( + "SafetensorsStreamer") logger = init_logger(__name__) @@ -414,13 +422,6 @@ def runai_safetensors_weights_iterator( hf_weights_files: List[str] ) -> Generator[Tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model safetensor files.""" - try: - from runai_model_streamer import SafetensorsStreamer - except ImportError as err: - raise ImportError( - "Please install Run:ai optional dependency." - "You can install it with: pip install vllm[runai]") from err - enable_tqdm = not torch.distributed.is_initialized( ) or torch.distributed.get_rank() == 0 with SafetensorsStreamer() as streamer: diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index 314d21b746236..ed3bb82bf0aaa 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -1,13 +1,17 @@ -from typing import Any - import numpy as np import numpy.typing as npt from vllm.inputs.registry import InputContext +from vllm.utils import PlaceholderModule from .base import MultiModalPlugin from .inputs import AudioItem, MultiModalData, MultiModalKwargs +try: + import librosa +except ImportError: + librosa = PlaceholderModule("librosa") # type: ignore[assignment] + class AudioPlugin(MultiModalPlugin): """Plugin for audio data.""" @@ -28,26 +32,10 @@ class AudioPlugin(MultiModalPlugin): "There is no default maximum multimodal tokens") -def try_import_audio_packages() -> tuple[Any, Any]: - try: - import librosa - import soundfile - except ImportError as exc: - raise ImportError( - "Please install vllm[audio] for audio support.") from exc - return librosa, soundfile - - def resample_audio( audio: npt.NDArray[np.floating], *, orig_sr: float, target_sr: float, ) -> npt.NDArray[np.floating]: - try: - import librosa - except ImportError as exc: - msg = "Please install vllm[audio] for audio support." - raise ImportError(msg) from exc - return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 1cb9036bdfda3..a49da2bdee972 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -13,10 +13,24 @@ import vllm.envs as envs from vllm.connections import global_http_connection from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer +from vllm.utils import PlaceholderModule -from .audio import try_import_audio_packages from .inputs import MultiModalDataDict, PlaceholderRange -from .video import try_import_video_packages + +try: + import decord +except ImportError: + decord = PlaceholderModule("decord") # type: ignore[assignment] + +try: + import librosa +except ImportError: + librosa = PlaceholderModule("librosa") # type: ignore[assignment] + +try: + import soundfile +except ImportError: + soundfile = PlaceholderModule("soundfile") # type: ignore[assignment] logger = init_logger(__name__) @@ -128,8 +142,6 @@ async def async_fetch_image(image_url: str, def _load_video_from_bytes(b: bytes, num_frames: int = 32) -> npt.NDArray: - _, decord = try_import_video_packages() - video_path = BytesIO(b) vr = decord.VideoReader(video_path, num_threads=1) total_frame_num = len(vr) @@ -204,8 +216,6 @@ def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]: """ Load audio from a URL. """ - librosa, _ = try_import_audio_packages() - if audio_url.startswith("http"): audio_bytes = global_http_connection.get_bytes( audio_url, @@ -226,8 +236,6 @@ async def async_fetch_audio( """ Asynchronously fetch audio from a URL. """ - librosa, _ = try_import_audio_packages() - if audio_url.startswith("http"): audio_bytes = await global_http_connection.async_get_bytes( audio_url, @@ -286,8 +294,6 @@ def encode_audio_base64( sampling_rate: int, ) -> str: """Encode audio as base64.""" - _, soundfile = try_import_audio_packages() - buffered = BytesIO() soundfile.write(buffered, audio, sampling_rate, format="WAV") diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index bfcdef70718bc..c4be100562703 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -1,6 +1,7 @@ from functools import lru_cache from typing import TYPE_CHECKING, Any, Dict, Optional +import cv2 import numpy as np import numpy.typing as npt @@ -78,19 +79,7 @@ class VideoPlugin(ImagePlugin): return 4096 -def try_import_video_packages() -> tuple[Any, Any]: - try: - import cv2 - import decord - except ImportError as exc: - raise ImportError( - "Please install vllm[video] for video support.") from exc - return cv2, decord - - def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray: - cv2, _ = try_import_video_packages() - num_frames, _, _, channels = frames.shape new_height, new_width = size resized_frames = np.empty((num_frames, new_height, new_width, channels), diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py index 6f63dab74d696..6ae68161bbd97 100644 --- a/vllm/transformers_utils/s3_utils.py +++ b/vllm/transformers_utils/s3_utils.py @@ -6,7 +6,12 @@ import tempfile from pathlib import Path from typing import Optional -import boto3 +from vllm.utils import PlaceholderModule + +try: + import boto3 +except ImportError: + boto3 = PlaceholderModule("boto3") # type: ignore[assignment] def _filter_allow(paths: list[str], patterns: list[str]) -> list[str]: diff --git a/vllm/utils.py b/vllm/utils.py index 49e532540d7ee..7d290dcb7dad0 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -6,10 +6,12 @@ import datetime import enum import gc import getpass +import importlib.metadata import importlib.util import inspect import ipaddress import os +import re import signal import socket import subprocess @@ -1550,6 +1552,67 @@ def import_from_path(module_name: str, file_path: Union[str, os.PathLike]): return module +@lru_cache(maxsize=None) +def get_vllm_optional_dependencies(): + metadata = importlib.metadata.metadata("vllm") + requirements = metadata.get_all("Requires-Dist", []) + extras = metadata.get_all("Provides-Extra", []) + + return { + extra: [ + re.split(r";|>=|<=|==", req)[0] for req in requirements + if req.endswith(f'extra == "{extra}"') + ] + for extra in extras + } + + +@dataclass(frozen=True) +class PlaceholderModule: + """ + A placeholder object to use when a module does not exist. + + This enables more informative errors when trying to access attributes + of a module that does not exists. + """ + name: str + + def placeholder_attr(self, attr_path: str): + return _PlaceholderModuleAttr(self, attr_path) + + def __getattr__(self, key: str): + name = self.name + + try: + importlib.import_module(self.name) + except ImportError as exc: + for extra, names in get_vllm_optional_dependencies().items(): + if name in names: + msg = f"Please install vllm[{extra}] for {extra} support" + raise ImportError(msg) from exc + + raise exc + + raise AssertionError("PlaceholderModule should not be used " + "when the original module can be imported") + + +@dataclass(frozen=True) +class _PlaceholderModuleAttr: + module: PlaceholderModule + attr_path: str + + def placeholder_attr(self, attr_path: str): + return _PlaceholderModuleAttr(self.module, + f"{self.attr_path}.{attr_path}") + + def __getattr__(self, key: str): + getattr(self.module, f"{self.attr_path}.{key}") + + raise AssertionError("PlaceholderModule should not be used " + "when the original module can be imported") + + # create a library to hold the custom op vllm_lib = Library("vllm", "FRAGMENT") # noqa From b85a977822c4216430c5a27a2fc47c93277e4b29 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 27 Dec 2024 01:31:29 +0800 Subject: [PATCH 8/8] [Doc] Add video example to openai client for multimodal (#11521) Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Cyrus Leung --- docs/source/usage/multimodal_inputs.md | 52 ++++++++++++- ...i_chat_completion_client_for_multimodal.py | 73 +++++++++++++++++-- 2 files changed, 114 insertions(+), 11 deletions(-) diff --git a/docs/source/usage/multimodal_inputs.md b/docs/source/usage/multimodal_inputs.md index 82a3f3b8909a1..4f45a9f448cf0 100644 --- a/docs/source/usage/multimodal_inputs.md +++ b/docs/source/usage/multimodal_inputs.md @@ -294,12 +294,58 @@ $ export VLLM_IMAGE_FETCH_TIMEOUT= ### Video -Instead of {code}`image_url`, you can pass a video file via {code}`video_url`. +Instead of {code}`image_url`, you can pass a video file via {code}`video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf). -You can use [these tests](gh-file:entrypoints/openai/test_video.py) as reference. +First, launch the OpenAI-compatible server: + +```bash +vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model-len 8192 +``` + +Then, you can use the OpenAI client as follows: +```python +from openai import OpenAI + +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4" + +## Use video url in the payload +chat_completion_from_url = client.chat.completions.create( + messages=[{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's in this video?" + }, + { + "type": "video_url", + "video_url": { + "url": video_url + }, + }, + ], + }], + model=model, + max_completion_tokens=64, +) + +result = chat_completion_from_url.choices[0].message.content +print("Chat completion output from image url:", result) +``` + +Full example: ````{note} -By default, the timeout for fetching videos through HTTP URL url is `30` seconds. +By default, the timeout for fetching videos through HTTP URL is `30` seconds. You can override this by setting the environment variable: ```console diff --git a/examples/openai_chat_completion_client_for_multimodal.py b/examples/openai_chat_completion_client_for_multimodal.py index 6a160fd70423f..213d075542e81 100644 --- a/examples/openai_chat_completion_client_for_multimodal.py +++ b/examples/openai_chat_completion_client_for_multimodal.py @@ -18,7 +18,6 @@ import base64 import requests from openai import OpenAI -from vllm.assets.audio import AudioAsset from vllm.utils import FlexibleArgumentParser # Modify OpenAI's API key and API base to use vLLM's API server. @@ -151,8 +150,66 @@ def run_multi_image() -> None: print("Chat completion output:", result) +# Video input inference +def run_video() -> None: + video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4" + video_base64 = encode_base64_content_from_url(video_url) + + ## Use video url in the payload + chat_completion_from_url = client.chat.completions.create( + messages=[{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's in this video?" + }, + { + "type": "video_url", + "video_url": { + "url": video_url + }, + }, + ], + }], + model=model, + max_completion_tokens=64, + ) + + result = chat_completion_from_url.choices[0].message.content + print("Chat completion output from image url:", result) + + ## Use base64 encoded video in the payload + chat_completion_from_base64 = client.chat.completions.create( + messages=[{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's in this video?" + }, + { + "type": "video_url", + "video_url": { + "url": f"data:video/mp4;base64,{video_base64}" + }, + }, + ], + }], + model=model, + max_completion_tokens=64, + ) + + result = chat_completion_from_base64.choices[0].message.content + print("Chat completion output from base64 encoded image:", result) + + # Audio input inference def run_audio() -> None: + from vllm.assets.audio import AudioAsset + audio_url = AudioAsset("winning_call").url audio_base64 = encode_base64_content_from_url(audio_url) @@ -240,6 +297,7 @@ example_function_map = { "text-only": run_text_only, "single-image": run_single_image, "multi-image": run_multi_image, + "video": run_video, "audio": run_audio, } @@ -253,12 +311,11 @@ if __name__ == "__main__": parser = FlexibleArgumentParser( description='Demo on using OpenAI client for online inference with ' 'multimodal language models served with vLLM.') - parser.add_argument( - '--chat-type', - '-c', - type=str, - default="single-image", - choices=["text-only", "single-image", "multi-image", "audio"], - help='Conversation type with multimodal data.') + parser.add_argument('--chat-type', + '-c', + type=str, + default="single-image", + choices=list(example_function_map.keys()), + help='Conversation type with multimodal data.') args = parser.parse_args() main(args)