From 51a624bf024e351e678b598521b72a2e19b5e2ef Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 26 Dec 2024 12:23:20 +0800
Subject: [PATCH 1/8] [Misc] Move some multimodal utils to modality-specific
 modules (#11494)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 .../decoder_only/vision_language/test_awq.py  |  2 +-
 .../vision_language/test_h2ovl.py             |  2 +-
 .../vision_language/test_phi3v.py             |  2 +-
 .../vision_language/test_qwen2_vl.py          |  4 +-
 .../vision_language/vlm_utils/builders.py     |  5 +-
 .../vlm_utils/custom_inputs.py                |  5 +-
 .../vision_language/test_mllama.py            |  2 +-
 tests/multimodal/test_mapper.py               |  2 +-
 vllm/assets/video.py                          |  2 +-
 vllm/multimodal/audio.py                      | 12 ++++
 vllm/multimodal/image.py                      | 12 ++++
 vllm/multimodal/utils.py                      | 68 +------------------
 vllm/multimodal/video.py                      | 43 ++++++++++++
 13 files changed, 84 insertions(+), 77 deletions(-)

diff --git a/tests/models/decoder_only/vision_language/test_awq.py b/tests/models/decoder_only/vision_language/test_awq.py
index 6e6e5b40d6a35..18ceb34a4e042 100644
--- a/tests/models/decoder_only/vision_language/test_awq.py
+++ b/tests/models/decoder_only/vision_language/test_awq.py
@@ -3,7 +3,7 @@ from typing import List, Optional, Type
 import pytest
 import torch
 
-from vllm.multimodal.utils import rescale_image_size
+from vllm.multimodal.image import rescale_image_size
 
 from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
 from ...utils import check_logprobs_close
diff --git a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py
index 45a7365204403..7406df253e7f0 100644
--- a/tests/models/decoder_only/vision_language/test_h2ovl.py
+++ b/tests/models/decoder_only/vision_language/test_h2ovl.py
@@ -8,7 +8,7 @@ from transformers import AutoConfig
 # Import the functions to test
 from vllm.model_executor.models.h2ovl import (calculate_num_blocks,
                                               image_to_pixel_values_wrapper)
-from vllm.multimodal.utils import rescale_image_size
+from vllm.multimodal.image import rescale_image_size
 
 models = [
     "h2oai/h2ovl-mississippi-800m",  # Replace with your actual model names
diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py
index 82eae0705c9ba..3a8934adfb076 100644
--- a/tests/models/decoder_only/vision_language/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/test_phi3v.py
@@ -5,7 +5,7 @@ from typing import List, Optional, Tuple, Type
 import pytest
 from transformers import AutoTokenizer
 
-from vllm.multimodal.utils import rescale_image_size
+from vllm.multimodal.image import rescale_image_size
 from vllm.platforms import current_platform
 from vllm.sequence import SampleLogprobs
 
diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
index 71b6ba4dca435..51fe7d2ad32a8 100644
--- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
@@ -6,8 +6,8 @@ import torch
 from PIL import Image
 
 from vllm.entrypoints.llm import LLM
-from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
-                                   sample_frames_from_video)
+from vllm.multimodal.image import rescale_image_size
+from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
 
 from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
                           PromptVideoInput, VllmRunner)
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/builders.py b/tests/models/decoder_only/vision_language/vlm_utils/builders.py
index 66668296139f5..59773be709fa8 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/builders.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/builders.py
@@ -5,8 +5,9 @@ from typing import Callable, Iterable, List, Optional, Tuple, Union
 
 import torch
 
-from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
-                                   resize_video, sample_frames_from_video)
+from vllm.multimodal.image import rescale_image_size
+from vllm.multimodal.video import (rescale_video_size, resize_video,
+                                   sample_frames_from_video)
 
 from .....conftest import _ImageAssets, _VideoAssets
 from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER,
diff --git a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
index e698d8d3f6f56..2291f4fa0d0ac 100644
--- a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
@@ -1,8 +1,9 @@
 """Custom input builders for edge-cases in different models."""
 from typing import Callable
 
-from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
-                                   resize_video, sample_frames_from_video)
+from vllm.multimodal.image import rescale_image_size
+from vllm.multimodal.video import (rescale_video_size, resize_video,
+                                   sample_frames_from_video)
 
 from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
 from .builders import build_multi_image_inputs, build_single_image_inputs
diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py
index 77dd1d81f84d7..636a3eedff31b 100644
--- a/tests/models/encoder_decoder/vision_language/test_mllama.py
+++ b/tests/models/encoder_decoder/vision_language/test_mllama.py
@@ -6,7 +6,7 @@ from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
 
 from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
                                      global_force_attn_backend_context_manager)
-from vllm.multimodal.utils import rescale_image_size
+from vllm.multimodal.image import rescale_image_size
 from vllm.sequence import SampleLogprobs
 
 from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py
index 71832acbd17b8..81f2a06182bcc 100644
--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
@@ -6,7 +6,7 @@ from transformers import LlavaNextImageProcessor
 
 from vllm.config import ModelConfig
 from vllm.multimodal import MultiModalRegistry
-from vllm.multimodal.utils import rescale_image_size
+from vllm.multimodal.image import rescale_image_size
 
 
 @pytest.fixture
diff --git a/vllm/assets/video.py b/vllm/assets/video.py
index e4dcab10466db..e6779935bad17 100644
--- a/vllm/assets/video.py
+++ b/vllm/assets/video.py
@@ -7,7 +7,7 @@ import numpy.typing as npt
 from huggingface_hub import hf_hub_download
 from PIL import Image
 
-from vllm.multimodal.utils import (sample_frames_from_video,
+from vllm.multimodal.video import (sample_frames_from_video,
                                    try_import_video_packages)
 
 from .base import get_cache_dir
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index c92deddbcb255..314d21b746236 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -1,3 +1,5 @@
+from typing import Any
+
 import numpy as np
 import numpy.typing as npt
 
@@ -26,6 +28,16 @@ class AudioPlugin(MultiModalPlugin):
             "There is no default maximum multimodal tokens")
 
 
+def try_import_audio_packages() -> tuple[Any, Any]:
+    try:
+        import librosa
+        import soundfile
+    except ImportError as exc:
+        raise ImportError(
+            "Please install vllm[audio] for audio support.") from exc
+    return librosa, soundfile
+
+
 def resample_audio(
     audio: npt.NDArray[np.floating],
     *,
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 97bbce1ce1570..c705e1a3d1554 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -84,3 +84,15 @@ class ImagePlugin(MultiModalPlugin):
 
     def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
         return 3000
+
+
+def rescale_image_size(image: Image.Image,
+                       size_factor: float,
+                       transpose: int = -1) -> Image.Image:
+    """Rescale the dimensions of an image by a constant factor."""
+    new_width = int(image.width * size_factor)
+    new_height = int(image.height * size_factor)
+    image = image.resize((new_width, new_height))
+    if transpose >= 0:
+        image = image.transpose(Image.Transpose(transpose))
+    return image
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index be9643598448d..1cb9036bdfda3 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -2,7 +2,7 @@ import base64
 import os
 from functools import lru_cache
 from io import BytesIO
-from typing import Any, List, Optional, Tuple, TypeVar, Union
+from typing import List, Optional, Tuple, TypeVar, Union
 
 import numpy as np
 import numpy.typing as npt
@@ -14,7 +14,9 @@ from vllm.connections import global_http_connection
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
 
+from .audio import try_import_audio_packages
 from .inputs import MultiModalDataDict, PlaceholderRange
+from .video import try_import_video_packages
 
 logger = init_logger(__name__)
 
@@ -198,16 +200,6 @@ async def async_fetch_video(video_url: str,
     return video
 
 
-def try_import_audio_packages() -> Tuple[Any, Any]:
-    try:
-        import librosa
-        import soundfile
-    except ImportError as exc:
-        raise ImportError(
-            "Please install vllm[audio] for audio support.") from exc
-    return librosa, soundfile
-
-
 def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
     """
     Load audio from a URL.
@@ -324,60 +316,6 @@ def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
     return _load_image_from_bytes(base64.b64decode(image))
 
 
-def rescale_image_size(image: Image.Image,
-                       size_factor: float,
-                       transpose: int = -1) -> Image.Image:
-    """Rescale the dimensions of an image by a constant factor."""
-    new_width = int(image.width * size_factor)
-    new_height = int(image.height * size_factor)
-    image = image.resize((new_width, new_height))
-    if transpose >= 0:
-        image = image.transpose(Image.Transpose(transpose))
-    return image
-
-
-def try_import_video_packages():
-    try:
-        import cv2
-        import decord
-    except ImportError as exc:
-        raise ImportError(
-            "Please install vllm[video] for video support.") from exc
-    return cv2, decord
-
-
-def resize_video(frames: npt.NDArray, size: Tuple[int, int]) -> npt.NDArray:
-    cv2, _ = try_import_video_packages()
-
-    num_frames, _, _, channels = frames.shape
-    new_height, new_width = size
-    resized_frames = np.empty((num_frames, new_height, new_width, channels),
-                              dtype=frames.dtype)
-    for i, frame in enumerate(frames):
-        resized_frame = cv2.resize(frame, (new_width, new_height))
-        resized_frames[i] = resized_frame
-    return resized_frames
-
-
-def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
-    _, height, width, _ = frames.shape
-    new_height = int(height * size_factor)
-    new_width = int(width * size_factor)
-
-    return resize_video(frames, (new_height, new_width))
-
-
-def sample_frames_from_video(frames: npt.NDArray,
-                             num_frames: int) -> npt.NDArray:
-    total_frames = frames.shape[0]
-    if num_frames == -1:
-        return frames
-    else:
-        frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
-        sampled_frames = frames[frame_indices, ...]
-        return sampled_frames
-
-
 def encode_video_base64(frames: npt.NDArray) -> str:
     base64_frames = []
     frames_list = [frames[i] for i in range(frames.shape[0])]
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index ba9bf58a4a20c..bfcdef70718bc 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -2,6 +2,7 @@ from functools import lru_cache
 from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import numpy as np
+import numpy.typing as npt
 
 from vllm.inputs.registry import InputContext
 from vllm.logger import init_logger
@@ -75,3 +76,45 @@ class VideoPlugin(ImagePlugin):
 
     def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
         return 4096
+
+
+def try_import_video_packages() -> tuple[Any, Any]:
+    try:
+        import cv2
+        import decord
+    except ImportError as exc:
+        raise ImportError(
+            "Please install vllm[video] for video support.") from exc
+    return cv2, decord
+
+
+def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray:
+    cv2, _ = try_import_video_packages()
+
+    num_frames, _, _, channels = frames.shape
+    new_height, new_width = size
+    resized_frames = np.empty((num_frames, new_height, new_width, channels),
+                              dtype=frames.dtype)
+    for i, frame in enumerate(frames):
+        resized_frame = cv2.resize(frame, (new_width, new_height))
+        resized_frames[i] = resized_frame
+    return resized_frames
+
+
+def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray:
+    _, height, width, _ = frames.shape
+    new_height = int(height * size_factor)
+    new_width = int(width * size_factor)
+
+    return resize_video(frames, (new_height, new_width))
+
+
+def sample_frames_from_video(frames: npt.NDArray,
+                             num_frames: int) -> npt.NDArray:
+    total_frames = frames.shape[0]
+    if num_frames == -1:
+        return frames
+
+    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
+    sampled_frames = frames[frame_indices, ...]
+    return sampled_frames

From dbeac95dbbf898bcc0965528fc767e9cadbbe0c5 Mon Sep 17 00:00:00 2001
From: Lucas Tucker <47258766+lucas-tucker@users.noreply.github.com>
Date: Wed, 25 Dec 2024 23:04:07 -0600
Subject: [PATCH 2/8] Mypy checking for vllm/compilation (#11496)

Signed-off-by: lucast2021 <lucast2021@headroyce.org>
Co-authored-by: lucast2021 <lucast2021@headroyce.org>
---
 vllm/compilation/backends.py           | 10 +++++-----
 vllm/compilation/multi_output_match.py |  3 ++-
 vllm/compilation/pass_manager.py       |  4 ++--
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 0c7bbfe599b02..826d1744d88a5 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -141,14 +141,14 @@ class AlwaysHitShapeEnv:
         return ""
 
 
-def wrap_inductor(graph,
+def wrap_inductor(graph: fx.GraphModule,
                   example_inputs,
                   additional_inductor_config,
                   compilation_config: CompilationConfig,
                   graph_index: int = 0,
                   num_graphs: int = 1,
                   runtime_shape: Optional[int] = None,
-                  use_inductor: bool = True):
+                  use_inductor: bool = True) -> Any:
     if graph_index == 0:
         # before compiling the first graph, record the start time
         global compilation_start_time
@@ -209,7 +209,7 @@ def wrap_inductor(graph,
         returns_tuple = graph_returns_tuple(graph)
 
         # this is the graph we return to Dynamo to run
-        def compiled_graph(*args):
+        def compiled_graph(*args) -> Optional[fx.CompiledFxGraph]:
             # convert args to list
             list_args = list(args)
             graph_output = inductor_compiled_graph(list_args)
@@ -247,7 +247,7 @@ def wrap_inductor(graph,
             # see https://github.com/pytorch/pytorch/blob/9f5ebf3fc609105a74eab4ccc24932d6353ff566/torch/_inductor/codecache.py#L1221 # noqa
             return
 
-        def _get_shape_env():
+        def _get_shape_env() -> AlwaysHitShapeEnv:
             return AlwaysHitShapeEnv()
 
         with patch(# for hijacking the hash of the compiled graph
@@ -537,7 +537,7 @@ class VllmBackend:
             example_inputs[x].clone() for x in self.sym_tensor_indices
         ]
 
-        def copy_and_call(*args):
+        def copy_and_call(*args) -> fx.GraphModule:
             list_args = list(args)
             for i, index in enumerate(self.sym_tensor_indices):
                 runtime_tensor = list_args[index]
diff --git a/vllm/compilation/multi_output_match.py b/vllm/compilation/multi_output_match.py
index 0ad648abfbb3a..b6bcecdc89e26 100644
--- a/vllm/compilation/multi_output_match.py
+++ b/vllm/compilation/multi_output_match.py
@@ -7,6 +7,7 @@ from torch import fx
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
 from torch._inductor import pattern_matcher as pm
 from torch._ops import OpOverload
+from torch.fx import Node
 
 from vllm.compilation.fx_utils import find_auto_fn
 
@@ -97,7 +98,7 @@ class MultiOutputMatch(abc.ABC):
                 self.graph.call_function(operator.getitem, (tuple_node, idx))
                 for idx in indices)
 
-    def insert_auto_fn(self, op: OpOverload, kwargs):
+    def insert_auto_fn(self, op: OpOverload, kwargs) -> Node:
         """
         Insert an auto_functionalized node with the given op and kwargs.
         """
diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py
index fb522ae053e97..34f5f355798b2 100644
--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import Any, Dict, List
 
 from torch import fx as fx
 
@@ -53,7 +53,7 @@ class PostGradPassManager:
         assert isinstance(pass_, InductorPass)
         self.passes.append(pass_)
 
-    def __getstate__(self):
+    def __getstate__(self) -> Dict[str, List[Any]]:
         """
         Custom pickling for the pass manager, as some passes cannot be pickled.
         Pickling occurs because the pass manager is set as the value of

From aa25985bd1e7a4925a7061fdfbc93893b492627b Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Thu, 26 Dec 2024 15:52:48 +0800
Subject: [PATCH 3/8] [Misc][LoRA] Fix LoRA weight mapper (#11495)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 tests/lora/test_lora_checkpoints.py | 15 +++++++++-----
 tests/lora/test_qwen2vl.py          |  6 +++++-
 vllm/lora/models.py                 |  3 ++-
 vllm/lora/utils.py                  | 32 ++++++++++-------------------
 vllm/lora/worker_manager.py         |  2 ++
 5 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py
index 9842203eb15e0..537d95b025a9d 100644
--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
@@ -74,7 +74,7 @@ def test_load_checkpoints(
                 embedding_padding_modules=embed_padding_modules)
 
 
-def test_lora_weights_mapping(baichuan_lora_files, ):
+def test_lora_weights_mapping(baichuan_lora_files):
     supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules
     packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
     embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
@@ -86,10 +86,14 @@ def test_lora_weights_mapping(baichuan_lora_files, ):
         else:
             expected_lora_modules.append(module)
 
-    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
-        "model.": "language_model.model.",
-    }, )
-
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "model.": "language_model.model.",
+        },
+        orig_to_new_substr={
+            ".layers.": ".baichuan_layers.",
+        },
+    )
     lora_model = LoRAModel.from_local_checkpoint(
         baichuan_lora_files,
         expected_lora_modules,
@@ -101,3 +105,4 @@ def test_lora_weights_mapping(baichuan_lora_files, ):
     )
     for name in lora_model.loras:
         assert name.startswith(hf_to_vllm_mapper.orig_to_new_prefix["model."])
+        assert ".baichuan_layers." in name
diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py
index c8c720ff0c776..c9f48402b0268 100644
--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
@@ -22,7 +22,7 @@ IMAGE_ASSETS = [
 
 # After fine-tuning with LoRA, all generated content should start begin `A`.
 EXPECTED_OUTPUT = [
-    "A stop sign stands prominently in the foreground, with a traditional Chinese gate and a black SUV in the background, illustrating a blend of modern and cultural elements.",  # noqa: E501
+    "A red stop sign stands prominently in the foreground, with a traditional Chinese gate and a black SUV in the background, illustrating a blend of modern and cultural elements.",  # noqa: E501
     "A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.",  # noqa: E501
 ]
 
@@ -76,3 +76,7 @@ def test_qwen2vl_lora(qwen2vl_lora_files):
     output1 = do_sample(llm, qwen2vl_lora_files, lora_id=1)
     for i in range(len(EXPECTED_OUTPUT)):
         assert EXPECTED_OUTPUT[i].startswith(output1[i])
+
+    output2 = do_sample(llm, qwen2vl_lora_files, lora_id=2)
+    for i in range(len(EXPECTED_OUTPUT)):
+        assert EXPECTED_OUTPUT[i].startswith(output2[i])
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index f50db8e3b8e10..5c0e4e5cbc636 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -231,7 +231,8 @@ class LoRAModel(AdapterModel):
             with safetensors.safe_open(lora_tensor_path,
                                        framework="pt") as f:  # type: ignore
                 for lora_module in f.keys():  # noqa
-                    module_name, _, _ = parse_fine_tuned_lora_name(lora_module)
+                    module_name, _, _ = parse_fine_tuned_lora_name(
+                        lora_module, weights_mapper)
                     part_name = module_name.split(".")[-1]
                     if part_name not in expected_lora_modules:
                         unexpected_modules.append(module_name)
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index 3a84a6ae1c02a..d72b7638d84af 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -1,4 +1,3 @@
-import copy
 import os
 import re
 from typing import List, Optional, Set, Tuple, Type, Union
@@ -32,7 +31,6 @@ from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.models.utils import WeightsMapper
-from vllm.utils import print_warning_once
 
 logger = init_logger(__name__)
 
@@ -112,36 +110,28 @@ def parse_fine_tuned_lora_name(
             is_bias whether the tensor is lora bias.
     """
 
-    w_mapper = None
-    if weights_mapper:
-        w_mapper = copy.deepcopy(weights_mapper)
-        # TODO: Currently only supports mapping for prefix, mapping for
-        # substr and subfix will be supported in the future.
-        for attr, mapping in [
-            ("orig_to_new_substr", w_mapper.orig_to_new_substr),
-            ("orig_to_new_suffix", w_mapper.orig_to_new_suffix),
-        ]:
-            if mapping:
-                print_warning_once(
-                    f"vLLM currently does not support mapping of LoRA weights "
-                    f"for {mapping}.")
-                setattr(w_mapper, attr, {})
+    # LoRA weight qualified name always starts with `base_model.model.`,
+    # so we remove the prefix `base_model.model.` to make the following
+    # mapping correctly.
+    if "base_model.model." in name:
+        name = name.replace("base_model.model.", "")
+        name = weights_mapper._map_name(name) if weights_mapper else name
+        # recover the prefix `base_model.model.`
+        name = "base_model.model." + name
 
-    mapper = (lambda name: w_mapper._map_name(name)
-              if w_mapper is not None else name)
     parts = name.split(".")
     if parts[-1] == "weight" and (parts[-2] == "lora_A"
                                   or parts[-2] == "lora_B"):
         new_name = ".".join(parts[2:-2])
-        return mapper(new_name), parts[-2] == "lora_A", False
+        return new_name, parts[-2] == "lora_A", False
 
     if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B":
         new_name = ".".join(parts[2:-1])
-        return mapper(new_name), parts[-1] == "lora_embedding_A", False
+        return new_name, parts[-1] == "lora_embedding_A", False
 
     if parts[-1] == "bias":
         new_name = ".".join(parts[2:-2])
-        return mapper(new_name), False, True
+        return new_name, False, True
 
     raise ValueError(f"{name} is unsupported LoRA weight")
 
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index ef8cc5886103e..10976fac23028 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -91,6 +91,8 @@ class WorkerLoRAManager(AbstractWorkerManager):
                         packed_modules_mapping[module])
                 else:
                     expected_lora_modules.append(module)
+
+            expected_lora_modules = list(set(expected_lora_modules))
             lora_path = get_adapter_absolute_path(lora_request.lora_path)
 
             # For some models like Qwen2VL, we need to use hf_to_vllm_mapper

From 7492a362077ace26b4f0374a39ba5b0846962b87 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Thu, 26 Dec 2024 01:44:32 -0800
Subject: [PATCH 4/8] [Doc] Add `QVQ` and `QwQ` to the list of supported models
 (#11509)

Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 docs/source/models/supported_models.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 099e6c8f02815..85fba83147708 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -319,7 +319,7 @@ See [this page](#generative-models) for more information on how to use generativ
     - ✅︎
   * - :code:`Qwen2ForCausalLM`
     - Qwen2
-    - :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc.
+    - :code:`Qwen/QwQ-32B-Preview`, :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc.
     - ✅︎
     - ✅︎
   * - :code:`Qwen2MoeForCausalLM`
@@ -710,7 +710,7 @@ See [this page](#generative-models) for more information on how to use generativ
   * - :code:`Qwen2VLForConditionalGeneration`
     - Qwen2-VL
     - T + I\ :sup:`E+` + V\ :sup:`E+`
-    - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
+    - :code:`Qwen/QVQ-72B-Preview`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
     - ✅︎
     - ✅︎
     -

From dcb1a944d4cf95b4a7b3522ddf970e6d3a28b8b5 Mon Sep 17 00:00:00 2001
From: sroy745 <142070531+sroy745@users.noreply.github.com>
Date: Thu, 26 Dec 2024 02:02:58 -0800
Subject: [PATCH 5/8] [V1] Adding min tokens/repetition/presence/frequence
 penalties to V1 sampler (#10681)

Signed-off-by: Sourashis Roy <sroy@roblox.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 tests/v1/engine/test_engine_core.py     |  38 +++
 tests/v1/sample/__init__.py             |   0
 tests/v1/sample/test_sampler.py         | 331 ++++++++++++++++++++++++
 tests/v1/worker/__init__.py             |   0
 tests/v1/worker/test_gpu_input_batch.py | 224 ++++++++++++++++
 vllm/model_executor/layers/sampler.py   |  51 +---
 vllm/model_executor/layers/utils.py     |  57 ++++
 vllm/v1/sample/metadata.py              |  12 +-
 vllm/v1/sample/sampler.py               |  65 ++++-
 vllm/v1/worker/gpu_input_batch.py       | 142 ++++++++++
 vllm/v1/worker/gpu_model_runner.py      |   8 +-
 11 files changed, 879 insertions(+), 49 deletions(-)
 create mode 100644 tests/v1/sample/__init__.py
 create mode 100644 tests/v1/sample/test_sampler.py
 create mode 100644 tests/v1/worker/__init__.py
 create mode 100644 tests/v1/worker/test_gpu_input_batch.py
 create mode 100644 vllm/model_executor/layers/utils.py

diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index a61ec63a365b5..c529cd21f384b 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -139,3 +139,41 @@ def test_engine_core(monkeypatch):
         engine_core.abort_requests([req2.request_id, req0.request_id])
         assert len(engine_core.scheduler.waiting) == 0
         assert len(engine_core.scheduler.running) == 0
+
+
+def test_engine_core_advanced_sampling(monkeypatch):
+    """
+    A basic end-to-end test to verify that the engine functions correctly 
+    when additional sampling parameters, such as min_tokens and 
+    presence_penalty, are set.
+    """
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+        """Setup the EngineCore."""
+        engine_args = EngineArgs(model=MODEL_NAME)
+        vllm_config = engine_args.create_engine_config(
+            usage_context=UsageContext.UNKNOWN_CONTEXT)
+        executor_class = AsyncLLM._get_executor_cls(vllm_config)
+
+        engine_core = EngineCore(vllm_config=vllm_config,
+                                 executor_class=executor_class,
+                                 usage_context=UsageContext.UNKNOWN_CONTEXT)
+        """Test basic request lifecycle."""
+        # First request.
+        request: EngineCoreRequest = make_request()
+        request.sampling_params = SamplingParams(
+            min_tokens=4,
+            presence_penalty=1.0,
+            frequency_penalty=1.0,
+            repetition_penalty=0.1,
+            stop_token_ids=[1001, 1002],
+        )
+        engine_core.add_request(request)
+        assert len(engine_core.scheduler.waiting) == 1
+        assert len(engine_core.scheduler.running) == 0
+        # Loop through until they are all done.
+        while len(engine_core.step()) > 0:
+            pass
+
+        assert len(engine_core.scheduler.waiting) == 0
+        assert len(engine_core.scheduler.running) == 0
diff --git a/tests/v1/sample/__init__.py b/tests/v1/sample/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py
new file mode 100644
index 0000000000000..d8d055805cbea
--- /dev/null
+++ b/tests/v1/sample/test_sampler.py
@@ -0,0 +1,331 @@
+from typing import List, Set, Tuple
+
+import numpy as np
+import pytest
+import torch
+
+from vllm.utils import make_tensor_with_pad
+from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.sample.sampler import Sampler
+
+VOCAB_SIZE = 1024
+NUM_OUTPUT_TOKENS = 20
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
+MAX_NUM_PROMPT_TOKENS = 64
+
+
+def _create_fake_logits(batch_size: int, vocab_size: int) -> torch.Tensor:
+    fake_logits = torch.full((batch_size, vocab_size), 1e-2, dtype=torch.float)
+    return fake_logits
+
+
+def _create_penalty_tensor(batch_size: int, penalty_value: float,
+                           device: torch.device) -> torch.Tensor:
+    return torch.full((batch_size, ),
+                      fill_value=penalty_value,
+                      dtype=torch.float,
+                      device=device)
+
+
+def _create_prompt_tokens_tensor(
+    prompt_token_ids: List[List[int]],
+    vocab_size: int,
+    device: torch.device,
+) -> torch.Tensor:
+    return make_tensor_with_pad(
+        prompt_token_ids,
+        pad=vocab_size,
+        device=device,
+        dtype=torch.int64,
+        pin_memory=False,
+    )
+
+
+def _create_default_sampling_metadata(
+    num_output_tokens: int,
+    batch_size: int,
+    vocab_size: int,
+    device: torch.device,
+) -> SamplingMetadata:
+    output_token_ids: List[List[int]] = []
+    prompt_token_ids: List[List[int]] = []
+    for _ in range(batch_size):
+        output_token_ids.append(
+            np.random.randint(0, vocab_size, size=num_output_tokens).tolist())
+        prompt_token_ids.append(
+            np.random.randint(0,
+                              vocab_size,
+                              size=np.random.randint(
+                                  1, MAX_NUM_PROMPT_TOKENS)).tolist())
+    fake_sampling_metadata = SamplingMetadata(
+        temperature=torch.full((batch_size, ), 0.0),
+        all_greedy=True,
+        all_random=False,
+        top_p=torch.empty(batch_size, ),
+        top_k=torch.empty(batch_size, ),
+        no_top_p=True,
+        no_top_k=True,
+        generators={},
+        max_num_logprobs=VOCAB_SIZE,
+        prompt_token_ids=_create_prompt_tokens_tensor(prompt_token_ids,
+                                                      vocab_size, device),
+        output_token_ids=output_token_ids,
+        frequency_penalties=_create_penalty_tensor(batch_size, 0.0, device),
+        presence_penalties=_create_penalty_tensor(batch_size, 0.0, device),
+        repetition_penalties=_create_penalty_tensor(batch_size, 1.0, device),
+        no_penalties=True,
+        min_tokens=[],
+        stop_token_ids=[],
+    )
+    return fake_sampling_metadata
+
+
+def _generate_min_token_penalties_and_stop_tokens(
+    num_output_tokens: int, batch_size: int, vocab_size: int,
+    batch_indices_for_min_token_penalty: List[int]
+) -> Tuple[List[int], List[Set[int]]]:
+    """
+    Generates and returns a list of minimum token penalties (`min_tokens`) 
+    and a corresponding list of stop token IDs (`stop_token_ids`) for each 
+    batch.
+
+    If a batch index is included in `batch_indices_for_min_token_penalty`, 
+    a higher `min_tokens` value is assigned (within a randomized range), 
+    and a random set of stop token IDs is created. Otherwise, a lower 
+    `min_tokens` value is assigned, and the stop token IDs set is empty.   
+    """
+    stop_token_ids: List[Set[int]] = []
+    min_tokens: List[int] = []
+    for index in range(batch_size):
+        if index in batch_indices_for_min_token_penalty:
+            min_tokens.append(
+                np.random.randint(num_output_tokens + 1,
+                                  2 * num_output_tokens))
+            stop_token_ids.append(
+                set(
+                    np.random.randint(0, vocab_size - 1)
+                    for _ in range(np.random.randint(0, vocab_size))))
+
+        else:
+            min_tokens.append(np.random.randint(0, num_output_tokens))
+            stop_token_ids.append(set())
+    return (min_tokens, stop_token_ids)
+
+
+def _create_weighted_output_token_list(
+        batch_size: int,
+        vocab_size: int) -> Tuple[List[List[int]], List[List[int]]]:
+    """
+    Creates an output token list where each token occurs a distinct 
+    number of times.
+
+    For each batch, a random subset of token IDs is selected from the
+    vocabulary. The selected tokens are then added to the output token
+    list, each with a different frequency.
+
+    Returns:
+        Tuple[List[List[int]], List[List[int]]]:
+            - The first element is the output token list, where each sublist 
+              corresponds to a batch and contains tokens with weighted 
+              frequencies.
+            - The second element is a list of distinct token IDs for each
+              batch, ordered by their frequency in the corresponding output
+              list.
+    """
+    output_token_ids: List[List[int]] = []
+    sorted_token_ids_in_output: List[List[int]] = []
+    for _ in range(batch_size):
+        distinct_token_ids = np.random.choice(vocab_size,
+                                              size=np.random.randint(1, 10),
+                                              replace=False).tolist()
+        sorted_token_ids_in_output.append(distinct_token_ids)
+        output_token_ids_for_batch = []
+        for index, token_id in enumerate(distinct_token_ids):
+            output_token_ids_for_batch.extend(
+                [token_id for _ in range(index + 1)])
+        output_token_ids.append(output_token_ids_for_batch)
+    return (output_token_ids, sorted_token_ids_in_output)
+
+
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("batch_size", [1, 2, 32])
+def test_sampler_min_tokens_penalty(device: str, batch_size: int):
+    """
+    Tests that if the number of output tokens is less than 
+    SamplingParams.min_tokens then we will set the logits for
+    the stop token ids to -inf.
+    """
+    torch.set_default_device(device)
+    fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
+    sampling_metadata = _create_default_sampling_metadata(
+        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device))
+    batch_indices_for_min_token_penalty = np.random.randint(
+        0, batch_size - 1, size=np.random.randint(0, batch_size)).tolist()
+    min_tokens, stop_token_ids = _generate_min_token_penalties_and_stop_tokens(
+        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE,
+        batch_indices_for_min_token_penalty)
+    sampling_metadata.min_tokens = min_tokens
+    sampling_metadata.stop_token_ids = stop_token_ids
+    sampler = Sampler()
+    sampler_output = sampler(fake_logits, sampling_metadata)
+    for batch_idx in range(batch_size):
+        for vocab in range(VOCAB_SIZE):
+            # Verify that the logprobs for stop token ids is set
+            # to -inf.
+            logprob_index = torch.where(
+                sampler_output.logprob_token_ids[batch_idx] ==
+                vocab)[0].item()
+            if vocab in stop_token_ids[batch_idx]:
+                assert sampler_output.logprobs[batch_idx][
+                    logprob_index] == -float("inf")
+            else:
+                assert sampler_output.logprobs[batch_idx][
+                    logprob_index] != -float("inf")
+
+
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("batch_size", [1, 2, 32])
+@pytest.mark.parametrize("presence_penalty", [-2.0, 2.0])
+def test_sampler_presence_penalty(device: str, batch_size: int,
+                                  presence_penalty: float):
+    """
+    Test to verify that if presence penalty is enabled then tokens
+    are penalized as per their presence in the existing output.
+    """
+    torch.set_default_device(device)
+    # Create fake logits where each token is assigned the same
+    # logit value.
+    fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
+    sampling_metadata = _create_default_sampling_metadata(
+        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device))
+    output_token_ids = sampling_metadata.output_token_ids
+    sampling_metadata.presence_penalties = _create_penalty_tensor(
+        batch_size, presence_penalty, torch.device(device))
+    sampling_metadata.no_penalties = False
+    sampler = Sampler()
+    sampler_output = sampler(fake_logits, sampling_metadata)
+    for batch_idx in range(batch_size):
+        # The logprobs in the SamplerOutput are arranged in descending order.
+        # Since all tokens initially have the same logprobs, the non-penalized
+        # tokens will appear at the beginning, while the penalized tokens
+        #  will appear at the end of the list.
+        penalized_token_id = sampler_output.logprob_token_ids[batch_idx][
+            VOCAB_SIZE - 1]
+        penalized_log_prod = sampler_output.logprobs[batch_idx][VOCAB_SIZE - 1]
+        non_penalized_token_id = sampler_output.logprob_token_ids[batch_idx][0]
+        non_penalized_log_prod = sampler_output.logprobs[batch_idx][0]
+        assert non_penalized_log_prod > penalized_log_prod
+        if presence_penalty > 0:
+            # If `presence_penalty` is set to a value greater than 0, it
+            # indicates a preference for new tokens over those already
+            # present in the output.
+            # Verify that the penalized token ID exists in the output, while the
+            # non-penalized token ID does not.
+            assert penalized_token_id in output_token_ids[batch_idx]
+            assert non_penalized_token_id not in output_token_ids[batch_idx]
+        elif presence_penalty < 0:
+            # If `presence_penalty` is set to a value less than 0, it indicates
+            # a preference for existing tokens over new ones. Verify that the
+            # non-penalized token ID exists in the output, while the penalized
+            # token ID does not.
+            assert non_penalized_token_id in output_token_ids[batch_idx]
+            assert penalized_token_id not in output_token_ids[batch_idx]
+
+
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("batch_size", [1, 2, 32])
+@pytest.mark.parametrize("frequency_penalty", [-2.0, 2.0])
+def test_sampler_frequency_penalty(device: str, batch_size: int,
+                                   frequency_penalty: float):
+    """
+    Test to verify that if frequency penalty is enabled then tokens are
+    penalized as per their frequency of occurrence.
+    """
+    torch.set_default_device(device)
+    # Create fake logits where each token is assigned the same
+    # logit value.
+    fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
+    sampling_metadata = _create_default_sampling_metadata(
+        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device))
+    sampling_metadata.frequency_penalties = _create_penalty_tensor(
+        batch_size, frequency_penalty, torch.device(device))
+    output_token_ids, sorted_token_ids_in_output = \
+        _create_weighted_output_token_list(batch_size, VOCAB_SIZE)
+    sampling_metadata.output_token_ids = output_token_ids
+    sampling_metadata.no_penalties = False
+    sampler = Sampler()
+    sampler_output = sampler(fake_logits, sampling_metadata)
+    for batch_idx in range(batch_size):
+        logprobs_token_ids = sampler_output.logprob_token_ids[batch_idx]
+        non_penalized_token_id = logprobs_token_ids[0]
+        penalized_token_id = logprobs_token_ids[VOCAB_SIZE - 1]
+        distinct_sorted_token_ids_in_output = \
+            sorted_token_ids_in_output[batch_idx]
+        most_frequent_token_id = distinct_sorted_token_ids_in_output[
+            len(distinct_sorted_token_ids_in_output) - 1]
+        if frequency_penalty > 0:
+            # If `frequency_penalty` is set to > 0, it indicates
+            # a preference for new tokens over existing ones. Verify that the
+            # non-penalized token ID is not present in the output, while the
+            # most penalized token is the one that occurs most frequently in
+            # the output.
+            assert non_penalized_token_id \
+                not in distinct_sorted_token_ids_in_output
+            assert penalized_token_id == most_frequent_token_id
+        elif frequency_penalty < 0:
+            # If `frequency_penalty` is set to < 0, it indicates
+            # a preference for existing tokens over new ones. Verify that the
+            # non-penalized token ID is the one that occurs most frequently
+            # in the output, while the penalized token ID is one that has not
+            # yet appeared.
+            assert non_penalized_token_id == most_frequent_token_id
+            assert penalized_token_id \
+                not in distinct_sorted_token_ids_in_output
+
+
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("batch_size", [1, 2, 32])
+@pytest.mark.parametrize("repetition_penalty", [0.1, 1.9])
+def test_sampler_repetition_penalty(device: str, batch_size: int,
+                                    repetition_penalty: float):
+    """
+    Test to verify that when the repetition penalty is enabled, tokens 
+    are penalized based on their presence in the prompt or the existing
+    output.
+    """
+    torch.set_default_device(device)
+    # Create fake logits where each token is assigned the same
+    # logit value.
+    fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE)
+    sampling_metadata = _create_default_sampling_metadata(
+        NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device))
+    sampling_metadata.repetition_penalties = _create_penalty_tensor(
+        batch_size, repetition_penalty, torch.device(device))
+    sampling_metadata.no_penalties = False
+    sampler = Sampler()
+    sampler_output = sampler(fake_logits, sampling_metadata)
+    for batch_idx in range(batch_size):
+        logprobs_token_ids = sampler_output.logprob_token_ids[batch_idx]
+        non_penalized_token_id = logprobs_token_ids[0]
+        penalized_token_id = logprobs_token_ids[VOCAB_SIZE - 1]
+        prompt_tokens = sampling_metadata.prompt_token_ids[
+            batch_idx][:].tolist()
+        output_tokens = sampling_metadata.output_token_ids[batch_idx]
+        if repetition_penalty > 1.0:
+            # If `repetition_penalty` > 1.0, verify that the non-penalized
+            # token ID has not been seen before, while the penalized token ID
+            # exists either in the prompt or the output.
+            assert (non_penalized_token_id not in prompt_tokens and \
+                non_penalized_token_id not in output_tokens)
+            assert (penalized_token_id  in prompt_tokens or \
+                penalized_token_id in output_tokens)
+        elif repetition_penalty < 1.0:
+            # If `repetition_penalty` < 1.0, verify that the penalized
+            # token ID has not been seen before, while the non-penalized
+            # token ID exists either in the prompt or the output.
+            assert (penalized_token_id not in prompt_tokens and \
+                penalized_token_id not in output_tokens)
+            assert (non_penalized_token_id  in prompt_tokens or \
+                non_penalized_token_id in output_tokens)
diff --git a/tests/v1/worker/__init__.py b/tests/v1/worker/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py
new file mode 100644
index 0000000000000..694ce81ff6e22
--- /dev/null
+++ b/tests/v1/worker/test_gpu_input_batch.py
@@ -0,0 +1,224 @@
+from typing import Dict, List, Set, Tuple
+
+import numpy as np
+import pytest
+import torch
+
+from vllm.sampling_params import SamplingParams
+from vllm.utils import is_pin_memory_available, make_tensor_with_pad
+from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
+
+VOCAB_SIZE = 1024
+NUM_OUTPUT_TOKENS = 20
+MAX_PROMPT_SIZE = 100
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
+MAX_NUM_PROMPT_TOKENS = 64
+
+
+def _remove_requests(
+        input_batch: InputBatch, batch_size: int,
+        reqs: List[CachedRequestState]) -> Tuple[Set[str], List[int]]:
+    """
+    Remove some requests randomly from the batch and returns a Tuple
+    of 1) set of request removed 2) indices of the requests removed
+    ordered in descending order
+    """
+
+    num_reqs_to_remove = np.random.randint(0, batch_size)
+    req_indices_to_remove: Set[int] = set()
+    for _ in range(num_reqs_to_remove):
+        req_index_to_remove = np.random.randint(0, batch_size)
+        req_indices_to_remove.add(req_index_to_remove)
+
+    req_indices_to_remove_list = list(req_indices_to_remove)
+    req_indices_to_remove_list.sort(reverse=True)
+    req_ids_to_remove: Set[str] = set()
+    for index in req_indices_to_remove:
+        input_batch.remove_request(reqs[index].req_id)
+        req_ids_to_remove.add(reqs[index].req_id)
+    return (req_ids_to_remove, req_indices_to_remove_list)
+
+
+def _construct_expected_sampling_metadata(
+        reqs: List[CachedRequestState], req_ids_retained: Set[int],
+        req_id_index_in_input_batch: Dict[str, int],
+        device: torch.device) -> SamplingMetadata:
+    """
+    Constructs and returns the expected SamplingMetadata for this
+    batch.
+    """
+    num_reqs = len(req_ids_retained)
+    output_token_ids: List[List[int]] = [list() for _ in range(num_reqs)]
+    prompt_token_ids: List[List[int]] = [list() for _ in range(num_reqs)]
+    presence_penalties = [0.0 for _ in range(num_reqs)]
+    frequency_penalties = [0.0 for _ in range(num_reqs)]
+    repetition_penalties = [1.0 for _ in range(num_reqs)]
+    top_k = [0 for _ in range(num_reqs)]
+    top_p = [0.0 for _ in range(num_reqs)]
+    temperature = [0.0 for _ in range(num_reqs)]
+    stop_token_ids: List[Set[int]] = [set() for _ in range(num_reqs)]
+    min_tokens = [0 for _ in range(num_reqs)]
+    for req in reqs:
+        if req.req_id not in req_ids_retained:
+            continue
+        index_in_input_batch = req_id_index_in_input_batch[req.req_id]
+        output_token_ids[index_in_input_batch] = req.output_token_ids
+        prompt_token_ids[index_in_input_batch] = req.prompt_token_ids
+        presence_penalties[
+            index_in_input_batch] = req.sampling_params.presence_penalty
+        frequency_penalties[
+            index_in_input_batch] = req.sampling_params.frequency_penalty
+        repetition_penalties[
+            index_in_input_batch] = req.sampling_params.repetition_penalty
+        top_k[index_in_input_batch] = req.sampling_params.top_k
+        top_p[index_in_input_batch] = req.sampling_params.top_p
+        temperature[index_in_input_batch] = req.sampling_params.temperature
+        stop_token_ids[
+            index_in_input_batch] = req.sampling_params.all_stop_token_ids
+        min_tokens[index_in_input_batch] = req.sampling_params.min_tokens
+
+
+    return SamplingMetadata(
+        temperature=torch.tensor(temperature, dtype=torch.float, device=device),
+        all_greedy=False,
+        all_random=True,
+        top_p=torch.tensor(top_p, dtype=torch.float, device=device),
+        top_k=torch.tensor(top_k, dtype=torch.int, device=device),
+        no_top_p=all(x == 1.0 for x in top_p),
+        no_top_k=all(x == 0 for x in top_k),
+        generators={},
+        max_num_logprobs=0,
+        prompt_token_ids= make_tensor_with_pad(
+            prompt_token_ids,
+            pad=VOCAB_SIZE,
+            device=torch.device(device),
+            dtype=torch.int64,
+        ),
+        frequency_penalties=torch.tensor(
+            frequency_penalties, dtype=torch.float,
+            device=device),
+        presence_penalties=torch.tensor(
+            presence_penalties, dtype=torch.float,
+            device=device),
+        repetition_penalties=torch.tensor(
+            repetition_penalties, dtype=torch.float,
+            device=device),
+        output_token_ids=output_token_ids,
+        min_tokens=min_tokens,
+        stop_token_ids=stop_token_ids,
+        no_penalties=(all(x ==0 for x in presence_penalties) and \
+            all(x ==0 for x in frequency_penalties) and \
+                all(x ==1 for x in repetition_penalties))
+    )
+
+
+def _create_sampling_params():
+    return SamplingParams(top_k=np.random.randint(1, 10),
+                          top_p=np.random.uniform(0.0, 1.0),
+                          presence_penalty=np.random.uniform(-2.0, 2.0),
+                          repetition_penalty=np.random.uniform(0.0, 2.0),
+                          frequency_penalty=np.random.uniform(-2.0, 2.0),
+                          min_tokens=np.random.randint(1, 10),
+                          stop_token_ids=[
+                              np.random.randint(0, VOCAB_SIZE)
+                              for _ in range(np.random.randint(10))
+                          ])
+
+
+def _construct_cached_request_state(req_id_suffix: int):
+    prompt_token_ids = [
+        np.random.randint(0, VOCAB_SIZE)
+        for _ in range(np.random.randint(0, MAX_PROMPT_SIZE))
+    ]
+    output_token_ids = [
+        np.random.randint(0, VOCAB_SIZE)
+        for _ in range(np.random.randint(0, NUM_OUTPUT_TOKENS))
+    ]
+    return CachedRequestState(req_id=f"req_id_{req_id_suffix}",
+                              prompt_token_ids=prompt_token_ids,
+                              prompt=None,
+                              sampling_params=_create_sampling_params(),
+                              mm_inputs=[],
+                              mm_positions=[],
+                              block_ids=[],
+                              generator=None,
+                              num_computed_tokens=len(output_token_ids),
+                              output_token_ids=output_token_ids)
+
+
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@pytest.mark.parametrize("batch_size", [1, 2, 32, 64])
+def test_sampling_metadata_in_input_batch(device: str, batch_size: int):
+    """
+    Tests the logic for managing sampling metadata in the InputBatch.
+
+    This test involves adding a set of requests to the InputBatch,
+    followed by removing a subset of them. Afterward, the batch is compacted,
+    and the `make_sampling_metadata` method is invoked on the batch. The
+    output of `make_sampling_metadata` is then compared against the expected
+    results to ensure correctness.
+    """
+    input_batch: InputBatch = InputBatch(max_num_reqs=batch_size,
+                                         max_model_len=1024,
+                                         max_num_blocks_per_req=10,
+                                         device=torch.device(device),
+                                         pin_memory=is_pin_memory_available(),
+                                         vocab_size=1024)
+    reqs: List[CachedRequestState] = []
+    req_id_reqs = {}
+    req_id_output_token_ids = {}
+    # Add requests
+    for req_index in range(batch_size):
+        req: CachedRequestState = _construct_cached_request_state(req_index)
+        input_batch.add_request(req, req_index)
+        reqs.append(req)
+        req_id_reqs[req.req_id] = req
+        req_id_output_token_ids[req.req_id] = req.output_token_ids
+
+    # Remove some requests
+    req_ids_to_remove, req_indices_to_remove = _remove_requests(
+        input_batch, batch_size, reqs)
+    req_ids_retained = set(req_id_reqs.keys()) - req_ids_to_remove
+
+    # Compact the input batch
+    input_batch.condense(req_indices_to_remove)
+
+    # Generate the sampling metadata
+    sampling_metadata = input_batch.make_sampling_metadata(
+        req_id_output_token_ids, skip_copy=False)
+
+    # Create expected output.
+    expected_sampling_metadata = _construct_expected_sampling_metadata(
+        reqs,
+        req_ids_retained,
+        input_batch.req_id_to_index,
+        device=torch.device(device))
+
+    # Assert the actual and expected output.
+    assert torch.allclose(expected_sampling_metadata.temperature,
+                          sampling_metadata.temperature)
+    assert torch.allclose(expected_sampling_metadata.top_p,
+                          sampling_metadata.top_p)
+    assert torch.allclose(expected_sampling_metadata.top_k,
+                          sampling_metadata.top_k)
+    assert torch.allclose(expected_sampling_metadata.frequency_penalties,
+                          sampling_metadata.frequency_penalties)
+    assert torch.allclose(expected_sampling_metadata.presence_penalties,
+                          sampling_metadata.presence_penalties)
+    assert torch.allclose(expected_sampling_metadata.repetition_penalties,
+                          sampling_metadata.repetition_penalties)
+    assert torch.allclose(expected_sampling_metadata.prompt_token_ids,
+                          sampling_metadata.prompt_token_ids)
+    assert (expected_sampling_metadata.output_token_ids ==
+            sampling_metadata.output_token_ids)
+    assert (
+        expected_sampling_metadata.min_tokens == sampling_metadata.min_tokens)
+    assert (expected_sampling_metadata.stop_token_ids ==
+            sampling_metadata.stop_token_ids)
+    assert (expected_sampling_metadata.no_penalties ==
+            sampling_metadata.no_penalties)
+    assert (expected_sampling_metadata.no_top_p == sampling_metadata.no_top_p)
+    assert (expected_sampling_metadata.no_top_k == sampling_metadata.no_top_k)
diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py
index c10efefea5471..c2d12c466ba45 100644
--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@@ -11,6 +11,7 @@ import torch
 import torch.nn as nn
 
 import vllm.envs as envs
+from vllm.model_executor.layers.utils import apply_penalties
 from vllm.model_executor.sampling_metadata import (SamplingMetadata,
                                                    SamplingTensors,
                                                    SequenceGroupToSample)
@@ -258,11 +259,11 @@ class Sampler(nn.Module):
 
         # Apply presence and frequency penalties.
         if do_penalties:
-            logits = _apply_penalties(logits, sampling_tensors.prompt_tokens,
-                                      sampling_tensors.output_tokens,
-                                      sampling_tensors.presence_penalties,
-                                      sampling_tensors.frequency_penalties,
-                                      sampling_tensors.repetition_penalties)
+            logits = apply_penalties(logits, sampling_tensors.prompt_tokens,
+                                     sampling_tensors.output_tokens,
+                                     sampling_tensors.presence_penalties,
+                                     sampling_tensors.frequency_penalties,
+                                     sampling_tensors.repetition_penalties)
 
         # Use float32 to apply temperature scaling.
         # Use in-place division to avoid creating a new tensor.
@@ -336,23 +337,6 @@ class Sampler(nn.Module):
         return self.should_modify_greedy_probs_inplace
 
 
-def _get_bin_counts_and_mask(
-    tokens: torch.Tensor,
-    vocab_size: int,
-    num_seqs: int,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    # Compute the bin counts for the tokens.
-    # vocab_size + 1 for padding.
-    bin_counts = torch.zeros((num_seqs, vocab_size + 1),
-                             dtype=torch.long,
-                             device=tokens.device)
-    bin_counts.scatter_add_(1, tokens, torch.ones_like(tokens))
-    bin_counts = bin_counts[:, :vocab_size]
-    mask = bin_counts > 0
-
-    return bin_counts, mask
-
-
 def _apply_min_tokens_penalty(
     logits: torch.Tensor,
     sampling_metadata: SamplingMetadata,
@@ -400,29 +384,6 @@ def _apply_min_tokens_penalty(
     return logits
 
 
-def _apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor,
-                     output_tokens_tensor: torch.Tensor,
-                     presence_penalties: torch.Tensor,
-                     frequency_penalties: torch.Tensor,
-                     repetition_penalties: torch.Tensor) -> torch.Tensor:
-    num_seqs, vocab_size = logits.shape
-    _, prompt_mask = _get_bin_counts_and_mask(prompt_tokens_tensor, vocab_size,
-                                              num_seqs)
-    output_bin_counts, output_mask = _get_bin_counts_and_mask(
-        output_tokens_tensor, vocab_size, num_seqs)
-
-    repetition_penalties = repetition_penalties[:, None].repeat(1, vocab_size)
-    repetition_penalties[~(prompt_mask | output_mask)] = 1.0
-    logits = torch.where(logits > 0, logits / repetition_penalties,
-                         logits * repetition_penalties)
-
-    # We follow the definition in OpenAI API.
-    # Refer to https://platform.openai.com/docs/api-reference/parameter-details
-    logits -= frequency_penalties.unsqueeze_(dim=1) * output_bin_counts
-    logits -= presence_penalties.unsqueeze_(dim=1) * output_mask
-    return logits
-
-
 def _apply_top_k_top_p(
     logits: torch.Tensor,
     p: torch.Tensor,
diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py
new file mode 100644
index 0000000000000..f6f34cd49d953
--- /dev/null
+++ b/vllm/model_executor/layers/utils.py
@@ -0,0 +1,57 @@
+"""Utility methods for model layers."""
+from typing import Tuple
+
+import torch
+
+
+def get_token_bin_counts_and_mask(
+    tokens: torch.Tensor,
+    vocab_size: int,
+    num_seqs: int,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # Compute the bin counts for the tokens.
+    # vocab_size + 1 for padding.
+    bin_counts = torch.zeros((num_seqs, vocab_size + 1),
+                             dtype=torch.long,
+                             device=tokens.device)
+    bin_counts.scatter_add_(1, tokens, torch.ones_like(tokens))
+    bin_counts = bin_counts[:, :vocab_size]
+    mask = bin_counts > 0
+
+    return bin_counts, mask
+
+
+def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor,
+                    output_tokens_tensor: torch.Tensor,
+                    presence_penalties: torch.Tensor,
+                    frequency_penalties: torch.Tensor,
+                    repetition_penalties: torch.Tensor) -> torch.Tensor:
+    """
+    Applies penalties in place to the logits tensor
+    logits : The input logits tensor of shape [num_seqs, vocab_size]
+    prompt_tokens_tensor: A tensor containing the prompt tokens. The prompts 
+        are padded to the maximum prompt length within the batch using 
+        `vocab_size` as the padding value. The value `vocab_size` is used 
+        for padding because it does not correspond to any valid token ID 
+        in the vocabulary.
+    output_tokens_tensor: The output tokens tensor.
+    presence_penalties: The presence penalties of shape (num_seqs, )
+    frequency_penalties: The frequency penalties of shape (num_seqs, )
+    repetition_penalties: The repetition penalties of shape (num_seqs, )
+    """
+    num_seqs, vocab_size = logits.shape
+    _, prompt_mask = get_token_bin_counts_and_mask(prompt_tokens_tensor,
+                                                   vocab_size, num_seqs)
+    output_bin_counts, output_mask = get_token_bin_counts_and_mask(
+        output_tokens_tensor, vocab_size, num_seqs)
+    repetition_penalties = repetition_penalties.unsqueeze_(dim=1).repeat(
+        1, vocab_size)
+    logits[logits > 0] /= torch.where(prompt_mask | output_mask,
+                                      repetition_penalties, 1.0)[logits > 0]
+    logits[logits <= 0] *= torch.where(prompt_mask | output_mask,
+                                       repetition_penalties, 1.0)[logits <= 0]
+    # We follow the definition in OpenAI API.
+    # Refer to https://platform.openai.com/docs/api-reference/parameter-details
+    logits -= frequency_penalties.unsqueeze_(dim=1) * output_bin_counts
+    logits -= presence_penalties.unsqueeze_(dim=1) * output_mask
+    return logits
diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py
index 9ef36f2e6b212..d60f7eb5d76f9 100644
--- a/vllm/v1/sample/metadata.py
+++ b/vllm/v1/sample/metadata.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Dict
+from typing import Dict, List, Optional, Set
 
 import torch
 
@@ -19,3 +19,13 @@ class SamplingMetadata:
     generators: Dict[int, torch.Generator]
 
     max_num_logprobs: int
+
+    no_penalties: bool
+    prompt_token_ids: Optional[torch.Tensor]
+    frequency_penalties: torch.Tensor
+    presence_penalties: torch.Tensor
+    repetition_penalties: torch.Tensor
+
+    output_token_ids: List[List[int]]
+    min_tokens: List[int]
+    stop_token_ids: List[Set[int]]
diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py
index d1a755be01ff7..82470fb2610f8 100644
--- a/vllm/v1/sample/sampler.py
+++ b/vllm/v1/sample/sampler.py
@@ -1,9 +1,11 @@
 """A layer that samples the next tokens from the model's outputs."""
-from typing import Dict
+from typing import Dict, List, Set, Tuple
 
 import torch
 import torch.nn as nn
 
+from vllm.model_executor.layers.utils import apply_penalties
+from vllm.utils import is_pin_memory_available, make_tensor_with_pad
 from vllm.v1.outputs import SamplerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
 
@@ -17,9 +19,18 @@ class Sampler(nn.Module):
         logits: torch.Tensor,
         sampling_metadata: SamplingMetadata,
     ) -> SamplerOutput:
+        _apply_min_token_penalties(logits, sampling_metadata.output_token_ids,
+                                   sampling_metadata.stop_token_ids,
+                                   sampling_metadata.min_tokens)
+        if not sampling_metadata.no_penalties:
+            assert sampling_metadata.prompt_token_ids is not None
+            _apply_penalties(logits, sampling_metadata.prompt_token_ids,
+                             sampling_metadata.presence_penalties,
+                             sampling_metadata.frequency_penalties,
+                             sampling_metadata.repetition_penalties,
+                             sampling_metadata.output_token_ids)
         logits = self.apply_temperature(logits, sampling_metadata.temperature)
         logits = self.apply_top_k_top_p(logits, sampling_metadata)
-
         probs = self.get_probs(logits)
         sampled = self.sample(probs, sampling_metadata)
         # Use int32 to reduce the tensor size.
@@ -157,3 +168,53 @@ def _apply_top_k_top_p(
     # Re-sort the probabilities.
     logits = logits_sort.scatter(dim=-1, index=logits_idx, src=logits_sort)
     return logits
+
+
+def _apply_min_token_penalties(logits: torch.Tensor,
+                               output_token_ids: List[List[int]],
+                               stop_token_ids: List[Set[int]],
+                               min_tokens: List[int]):
+    """
+    Applies minimum token penalty by setting the logits of the stop tokens
+    to -inf.
+    """
+    min_tokens_logits_to_penalize: List[Tuple[int, int]] = []
+    for index, min_token in enumerate(min_tokens):
+        if (len(output_token_ids[index]) < min_token):
+            for stop_token_id in stop_token_ids[index]:
+                min_tokens_logits_to_penalize.append((index, stop_token_id))
+    if min_tokens_logits_to_penalize:
+        logits[tuple(zip(*min_tokens_logits_to_penalize))] = -float("inf")
+
+
+def _apply_penalties(logits: torch.Tensor, prompt_token_ids: torch.Tensor,
+                     presence_penalties: torch.Tensor,
+                     frequency_penalties: torch.Tensor,
+                     repetition_penalties: torch.Tensor,
+                     output_token_ids: List[List[int]]):
+    """
+    Applies presence, frequency and repetition penalties to the logits.
+    """
+    _, vocab_size = logits.shape
+    output_tokens_t = _convert_to_tensors(output_token_ids, vocab_size,
+                                          logits.device)
+    return apply_penalties(logits, prompt_token_ids, output_tokens_t,
+                           presence_penalties, frequency_penalties,
+                           repetition_penalties)
+
+
+def _convert_to_tensors(output_token_ids: List[List[int]], vocab_size: int,
+                        device: torch.device) -> torch.Tensor:
+    """
+    Convert the different list data structures to tensors.
+    """
+    output_tokens_tensor = make_tensor_with_pad(
+        output_token_ids,
+        # Use the value of vocab_size as a pad since we don't have a
+        # token_id of this value.
+        pad=vocab_size,
+        device="cpu",
+        dtype=torch.int64,
+        pin_memory=is_pin_memory_available(),
+    )
+    return output_tokens_tensor.to(device, non_blocking=True)
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 5c113c74778df..6c4d300ec6efe 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -43,12 +43,14 @@ class InputBatch:
         max_num_blocks_per_req: int,
         device: torch.device,
         pin_memory: bool,
+        vocab_size: int,
     ):
         self.max_num_reqs = max_num_reqs
         self.max_model_len = max_model_len
         self.max_num_blocks_per_req = max_num_blocks_per_req
         self.device = device
         self.pin_memory = pin_memory
+        self.vocab_size = vocab_size
 
         self.req_ids: List[Optional[str]] = [None] * max_num_reqs
         self.req_id_to_index: Dict[str, int] = {}
@@ -63,6 +65,7 @@ class InputBatch:
         )
         self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
         self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
+        self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
 
         # Attention-related.
         self.block_table = torch.zeros(
@@ -110,6 +113,50 @@ class InputBatch:
         self.top_k_cpu = self.top_k_cpu_tensor.numpy()
         self.top_k_reqs: Set[str] = set()
 
+        # Frequency penalty related data structures
+        self.frequency_penalties = torch.empty((max_num_reqs, ),
+                                               dtype=torch.float,
+                                               device=device)
+        self.frequency_penalties_cpu_tensor = torch.empty(
+            (max_num_reqs, ),
+            dtype=torch.float,
+            device="cpu",
+            pin_memory=pin_memory)
+        self.frequency_penalties_cpu = \
+            self.frequency_penalties_cpu_tensor.numpy()
+        self.frequency_penalties_reqs: Set[str] = set()
+
+        # Presence penalty related data structures
+        self.presence_penalties = torch.empty((max_num_reqs, ),
+                                              dtype=torch.float,
+                                              device=device)
+        self.presence_penalties_cpu_tensor = torch.empty((max_num_reqs, ),
+                                                         dtype=torch.float,
+                                                         device="cpu",
+                                                         pin_memory=pin_memory)
+        self.presence_penalties_cpu = \
+            self.presence_penalties_cpu_tensor.numpy()
+        self.presence_penalties_reqs: Set[str] = set()
+
+        # Repetition penalty related data structures
+        self.repetition_penalties = torch.empty((max_num_reqs, ),
+                                                dtype=torch.float,
+                                                device=device)
+        self.repetition_penalties_cpu_tensor = torch.empty(
+            (max_num_reqs, ),
+            dtype=torch.float,
+            device="cpu",
+            pin_memory=pin_memory)
+        self.repetition_penalties_cpu = \
+            self.repetition_penalties_cpu_tensor.numpy()
+        self.repetition_penalties_reqs: Set[str] = set()
+
+        self.min_tokens: List[int] = [0] * max_num_reqs
+        self.stop_token_ids: List[Set[int]] = [
+            set() for _ in range(max_num_reqs)
+        ]
+        self.prompt_token_ids: Optional[torch.Tensor] = None
+
         # req_index -> generator
         # NOTE(woosuk): The indices of the requests that do not have their own
         # generator should not be included in the dictionary.
@@ -133,6 +180,7 @@ class InputBatch:
 
         # Copy the prompt token ids and output token ids.
         num_prompt_tokens = len(request.prompt_token_ids)
+        self.num_prompt_tokens[req_index] = num_prompt_tokens
         self.token_ids_cpu[
             req_index, :num_prompt_tokens] = request.prompt_token_ids
         start_idx = num_prompt_tokens
@@ -157,6 +205,20 @@ class InputBatch:
         self.top_k_cpu[req_index] = sampling_params.top_k
         if sampling_params.top_k > 0:
             self.top_k_reqs.add(req_id)
+        self.frequency_penalties_cpu[req_index] = \
+            sampling_params.frequency_penalty
+        if sampling_params.frequency_penalty != 0.0:
+            self.frequency_penalties_reqs.add(req_id)
+        self.presence_penalties_cpu[req_index] = \
+            sampling_params.presence_penalty
+        if sampling_params.presence_penalty != 0.0:
+            self.presence_penalties_reqs.add(req_id)
+        self.repetition_penalties_cpu[req_index] = \
+            sampling_params.repetition_penalty
+        if sampling_params.repetition_penalty != 1.0:
+            self.repetition_penalties_reqs.add(req_id)
+        self.min_tokens[req_index] = sampling_params.min_tokens
+        self.stop_token_ids[req_index] = sampling_params.all_stop_token_ids
 
         # NOTE(woosuk): self.generators should not include the requests that
         # do not have their own generator.
@@ -179,6 +241,9 @@ class InputBatch:
         self.random_reqs.discard(req_id)
         self.top_p_reqs.discard(req_id)
         self.top_k_reqs.discard(req_id)
+        self.frequency_penalties_reqs.discard(req_id)
+        self.presence_penalties_reqs.discard(req_id)
+        self.repetition_penalties_reqs.discard(req_id)
         self.generators.pop(req_index, None)
         self.num_logprobs.pop(req_id, None)
         self.prompt_logprob_reqs.discard(req_id)
@@ -191,6 +256,9 @@ class InputBatch:
         self.random_reqs.clear()
         self.top_p_reqs.clear()
         self.top_k_reqs.clear()
+        self.frequency_penalties_reqs.clear()
+        self.presence_penalties_reqs.clear()
+        self.repetition_penalties_reqs.clear()
         self.generators.clear()
         self.num_logprobs.clear()
         self.prompt_logprob_reqs.clear()
@@ -224,6 +292,8 @@ class InputBatch:
             # block_table_cpu.
             self.token_ids_cpu[empty_index] = self.token_ids_cpu[
                 last_req_index]
+            self.num_prompt_tokens[empty_index] = \
+                self.num_prompt_tokens[last_req_index]
             self.num_computed_tokens_cpu[
                 empty_index] = self.num_computed_tokens_cpu[last_req_index]
             self.block_table_cpu[empty_index] = self.block_table_cpu[
@@ -232,6 +302,15 @@ class InputBatch:
                 last_req_index]
             self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index]
             self.top_k_cpu[empty_index] = self.top_k_cpu[last_req_index]
+            self.frequency_penalties_cpu[empty_index] = \
+                self.frequency_penalties_cpu[last_req_index]
+            self.presence_penalties_cpu[empty_index] = \
+                self.presence_penalties_cpu[last_req_index]
+            self.repetition_penalties_cpu[empty_index] = \
+                self.repetition_penalties_cpu[last_req_index]
+            self.min_tokens[empty_index] = self.min_tokens[last_req_index]
+            self.stop_token_ids[empty_index] = \
+                self.stop_token_ids[last_req_index]
             generator = self.generators.pop(last_req_index, None)
             if generator is not None:
                 self.generators[empty_index] = generator
@@ -241,6 +320,7 @@ class InputBatch:
 
     def make_sampling_metadata(
         self,
+        req_id_output_token_ids: Dict[str, List[int]],
         skip_copy: bool = False,
     ) -> SamplingMetadata:
         if not skip_copy:
@@ -250,6 +330,37 @@ class InputBatch:
                 self.top_p_cpu_tensor[:self.num_reqs], non_blocking=True)
             self.top_k[:self.num_reqs].copy_(
                 self.top_k_cpu_tensor[:self.num_reqs], non_blocking=True)
+            if not self.no_penalties:
+                # Since syncing these tensors is expensive only copy them
+                # if necessary i.e. if there are requests which require
+                # penalties to be applied during sampling.
+                self.frequency_penalties[:self.num_reqs].copy_(
+                    self.frequency_penalties_cpu_tensor[:self.num_reqs],
+                    non_blocking=True)
+                self.presence_penalties[:self.num_reqs].copy_(
+                    self.presence_penalties_cpu_tensor[:self.num_reqs],
+                    non_blocking=True)
+                self.repetition_penalties[:self.num_reqs].copy_(
+                    self.repetition_penalties_cpu_tensor[:self.num_reqs],
+                    non_blocking=True)
+                # The prompt tokens are used only for applying penalties during
+                # the sampling process. Hence copy these tensors only when
+                # there are requests which need penalties to be applied.
+                self.prompt_token_ids = self._make_prompt_token_ids_tensor()
+
+        output_token_ids: List[List[int]] = []
+
+        for req_id in self.req_ids[:self.num_reqs]:
+            assert req_id is not None
+            # Currently we create a tensor for output_token_ids from scratch
+            # at each step. However, for the penalties computation what we
+            # need is stats about the token ids present in the output. This
+            # stats can be maintained incrementally instead of computing it
+            # from scratch at each step.
+            # TODO - Replace this with incremental update to output token
+            # statistics.
+            output_token_ids.append(req_id_output_token_ids[req_id])
+
         return SamplingMetadata(
             temperature=self.temperature[:self.num_reqs],
             all_greedy=self.all_greedy,
@@ -260,8 +371,33 @@ class InputBatch:
             no_top_k=self.no_top_k,
             generators=self.generators,
             max_num_logprobs=self.max_num_logprobs,
+            prompt_token_ids=self.prompt_token_ids,
+            frequency_penalties=self.frequency_penalties[:self.num_reqs],
+            presence_penalties=self.presence_penalties[:self.num_reqs],
+            repetition_penalties=self.repetition_penalties[:self.num_reqs],
+            output_token_ids=output_token_ids,
+            min_tokens=self.min_tokens[:self.num_reqs],
+            stop_token_ids=self.stop_token_ids[:self.num_reqs],
+            no_penalties=self.no_penalties,
         )
 
+    def _make_prompt_token_ids_tensor(self) -> torch.Tensor:
+        max_prompt_len = self.num_prompt_tokens[:self.num_reqs].max()
+        prompt_token_ids_cpu_tensor = torch.empty(
+            (self.num_reqs, max_prompt_len),
+            device="cpu",
+            dtype=torch.int64,
+            pin_memory=self.pin_memory)
+        prompt_token_ids = prompt_token_ids_cpu_tensor.numpy()
+        prompt_token_ids[:] = (
+            self.token_ids_cpu[:self.num_reqs, :max_prompt_len])
+        # Use the value of vocab_size as a pad since we don't have a
+        # token_id of this value.
+        for i in range(self.num_reqs):
+            prompt_token_ids[i, self.num_prompt_tokens[i]:] = self.vocab_size
+        return prompt_token_ids_cpu_tensor.to(device=self.device,
+                                              non_blocking=True)
+
     @property
     def num_reqs(self) -> int:
         return len(self.req_id_to_index)
@@ -282,6 +418,12 @@ class InputBatch:
     def no_top_k(self) -> bool:
         return len(self.top_k_reqs) == 0
 
+    @property
+    def no_penalties(self) -> bool:
+        return (len(self.presence_penalties_reqs) == 0
+                and len(self.frequency_penalties_reqs) == 0
+                and len(self.repetition_penalties_reqs) == 0)
+
     @property
     def max_num_logprobs(self) -> int:
         return max(self.num_logprobs.values()) if self.num_logprobs else 0
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index ace62d8978bea..509771b7e2e5a 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -105,6 +105,7 @@ class GPUModelRunner:
             max_num_blocks_per_req=self.max_num_blocks_per_req,
             device=self.device,
             pin_memory=self.pin_memory,
+            vocab_size=model_config.get_vocab_size(),
         )
 
         self.use_cuda_graph = (self.vllm_config.compilation_config.level
@@ -383,7 +384,12 @@ class GPUModelRunner:
                 or scheduler_output.scheduled_resumed_reqs):
             skip_copy = False
         # Create the sampling metadata.
-        sampling_metadata = self.input_batch.make_sampling_metadata(skip_copy)
+        req_id_output_token_ids: Dict[str, List[int]] = \
+            {req_id: req.output_token_ids \
+                for req_id, req in self.requests.items()}
+
+        sampling_metadata = self.input_batch.make_sampling_metadata(
+            req_id_output_token_ids, skip_copy)
         return sampling_metadata
 
     def _execute_encoder(self, scheduler_output: "SchedulerOutput"):

From f57ee5650dd402c6147980824c6936c96cfa59fe Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Thu, 26 Dec 2024 21:12:05 +0800
Subject: [PATCH 6/8] [Model]  Modify MolmoForCausalLM MLP  (#11510)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
---
 vllm/model_executor/models/molmo.py | 42 ++++++++++++++++-------------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 63a25137f8aa9..8938f62d0c494 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -464,24 +464,27 @@ class MolmoAttention(nn.Module):
 class MolmoMLP(nn.Module):
     """Molmo's LLM mlp."""
 
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        input_dim: Optional[int] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-    ) -> None:
+    def __init__(self,
+                 config: PretrainedConfig,
+                 input_dim: Optional[int] = None,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 proj_name: str = "gate_up_proj") -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         self.intermediate_size = config.intermediate_size // 2
 
-        # Feed-forward input projection.
-        self.gate_up_proj = MergedColumnParallelLinear(
-            input_dim or self.hidden_size,
-            [self.intermediate_size] * 2,
-            bias=False,
-            quant_config=quant_config,
-        )
-
+        # Molmo's LLM proj weights are already merged into the disk, while
+        # image_projector proj is separate. If the same proj_name were used, it
+        # would create ambiguity and make it difficult to support BNB and LoRA.
+        self.proj_name = proj_name
+        setattr(
+            self, proj_name,
+            MergedColumnParallelLinear(
+                input_dim or self.hidden_size,
+                [self.intermediate_size] * 2,
+                bias=False,
+                quant_config=quant_config,
+            ))
         # Activation function.
         self.act_fn = SiluAndMul()
 
@@ -497,7 +500,7 @@ class MolmoMLP(nn.Module):
         self,
         x: torch.Tensor,
     ) -> torch.Tensor:
-        gate_up, _ = self.gate_up_proj(x)
+        gate_up, _ = getattr(self, self.proj_name)(x)
         x = self.act_fn(gate_up)
         x, _ = self.down_proj(x)
         return x
@@ -520,7 +523,9 @@ class MolmoDecoderLayer(nn.Module):
                                         prefix=f"{prefix}.self_attn")
 
         # MLP block.
-        self.mlp = MolmoMLP(config, quant_config=quant_config)
+        self.mlp = MolmoMLP(config,
+                            quant_config=quant_config,
+                            proj_name="gate_up_proj")
 
         # LayerNorm
         assert config.layer_norm_type == "rms"
@@ -616,6 +621,7 @@ class MolmoVisionBackbone(nn.Module):
             config,
             input_dim=vision_config.image_emb_dim,
             quant_config=quant_config,
+            proj_name="merged_linear",
         )
 
         image_dim = vision_config.image_emb_dim * len(self.vit_layers)
@@ -714,8 +720,8 @@ class MolmoVisionBackbone(nn.Module):
                                                    torch.Tensor]]) -> Set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
+            ("merged_linear", "gate_proj", 0),
+            ("merged_linear", "up_proj", 1),
         ]
         params_dict = dict(self.named_parameters())
         loaded_params: Set[str] = set()

From eec906d8114cd786315e49ab7f5a3093d1896880 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 26 Dec 2024 21:12:51 +0800
Subject: [PATCH 7/8] [Misc] Add placeholder module (#11501)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 tests/tensorizer_loader/test_tensorizer.py    |  9 ++-
 vllm/assets/audio.py                          | 19 +++---
 vllm/assets/base.py                           |  7 +--
 vllm/assets/image.py                          |  3 +-
 vllm/assets/video.py                          |  9 +--
 vllm/config.py                                | 10 +--
 vllm/model_executor/model_loader/loader.py    | 11 +---
 .../model_executor/model_loader/tensorizer.py | 25 ++++----
 .../model_loader/weight_utils.py              | 17 ++---
 vllm/multimodal/audio.py                      | 24 ++-----
 vllm/multimodal/utils.py                      | 26 +++++---
 vllm/multimodal/video.py                      | 13 +---
 vllm/transformers_utils/s3_utils.py           |  7 ++-
 vllm/utils.py                                 | 63 +++++++++++++++++++
 14 files changed, 143 insertions(+), 100 deletions(-)

diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index edd079bc7a389..0b0792b6b845f 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -9,7 +9,6 @@ import openai
 import pytest
 import torch
 from huggingface_hub import snapshot_download
-from tensorizer import EncryptionParams
 
 from vllm import SamplingParams
 from vllm.engine.arg_utils import EngineArgs
@@ -23,12 +22,18 @@ from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
                                                          serialize_vllm_model,
                                                          tensorize_vllm_model)
 # yapf: enable
-from vllm.utils import import_from_path
+from vllm.utils import PlaceholderModule, import_from_path
 
 from ..conftest import VllmRunner
 from ..utils import VLLM_PATH, RemoteOpenAIServer
 from .conftest import retry_until_skip
 
+try:
+    from tensorizer import EncryptionParams
+except ImportError:
+    tensorizer = PlaceholderModule("tensorizer")  # type: ignore[assignment]
+    EncryptionParams = tensorizer.placeholder_attr("EncryptionParams")
+
 EXAMPLES_PATH = VLLM_PATH / "examples"
 
 prompts = [
diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py
index 49bb6aeee90bc..9033644e3264a 100644
--- a/vllm/assets/audio.py
+++ b/vllm/assets/audio.py
@@ -1,11 +1,17 @@
 from dataclasses import dataclass
-from typing import Literal, Tuple
+from typing import Literal
 from urllib.parse import urljoin
 
-import librosa
-import numpy as np
+import numpy.typing as npt
 
-from vllm.assets.base import get_vllm_public_assets, vLLM_S3_BUCKET_URL
+from vllm.utils import PlaceholderModule
+
+from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets
+
+try:
+    import librosa
+except ImportError:
+    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
 
 ASSET_DIR = "multimodal_asset"
 
@@ -15,8 +21,7 @@ class AudioAsset:
     name: Literal["winning_call", "mary_had_lamb"]
 
     @property
-    def audio_and_sample_rate(self) -> Tuple[np.ndarray, int]:
-
+    def audio_and_sample_rate(self) -> tuple[npt.NDArray, int]:
         audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg",
                                             s3_prefix=ASSET_DIR)
         y, sr = librosa.load(audio_path, sr=None)
@@ -25,4 +30,4 @@ class AudioAsset:
 
     @property
     def url(self) -> str:
-        return urljoin(vLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg")
+        return urljoin(VLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg")
diff --git a/vllm/assets/base.py b/vllm/assets/base.py
index f97e8c218f65b..249173141106c 100644
--- a/vllm/assets/base.py
+++ b/vllm/assets/base.py
@@ -4,9 +4,8 @@ from typing import Optional
 
 import vllm.envs as envs
 from vllm.connections import global_http_connection
-from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT
 
-vLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com"
+VLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com"
 
 
 def get_cache_dir() -> Path:
@@ -32,8 +31,8 @@ def get_vllm_public_assets(filename: str,
         if s3_prefix is not None:
             filename = s3_prefix + "/" + filename
         global_http_connection.download_file(
-            f"{vLLM_S3_BUCKET_URL}/{filename}",
+            f"{VLLM_S3_BUCKET_URL}/{filename}",
             asset_path,
-            timeout=VLLM_IMAGE_FETCH_TIMEOUT)
+            timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT)
 
     return asset_path
diff --git a/vllm/assets/image.py b/vllm/assets/image.py
index 389ecd5c869bc..cb831cb0b5bb4 100644
--- a/vllm/assets/image.py
+++ b/vllm/assets/image.py
@@ -4,7 +4,7 @@ from typing import Literal
 import torch
 from PIL import Image
 
-from vllm.assets.base import get_vllm_public_assets
+from .base import get_vllm_public_assets
 
 VLM_IMAGES_DIR = "vision_model_images"
 
@@ -15,7 +15,6 @@ class ImageAsset:
 
     @property
     def pil_image(self) -> Image.Image:
-
         image_path = get_vllm_public_assets(filename=f"{self.name}.jpg",
                                             s3_prefix=VLM_IMAGES_DIR)
         return Image.open(image_path)
diff --git a/vllm/assets/video.py b/vllm/assets/video.py
index e6779935bad17..eca2ccc54482c 100644
--- a/vllm/assets/video.py
+++ b/vllm/assets/video.py
@@ -2,13 +2,13 @@ from dataclasses import dataclass
 from functools import lru_cache
 from typing import List, Literal
 
+import cv2
 import numpy as np
 import numpy.typing as npt
 from huggingface_hub import hf_hub_download
 from PIL import Image
 
-from vllm.multimodal.video import (sample_frames_from_video,
-                                   try_import_video_packages)
+from vllm.multimodal.video import sample_frames_from_video
 
 from .base import get_cache_dir
 
@@ -19,7 +19,7 @@ def download_video_asset(filename: str) -> str:
     Download and open an image from huggingface
     repo: raushan-testing-hf/videos-test
     """
-    video_directory = get_cache_dir() / "video-eample-data"
+    video_directory = get_cache_dir() / "video-example-data"
     video_directory.mkdir(parents=True, exist_ok=True)
 
     video_path = video_directory / filename
@@ -35,8 +35,6 @@ def download_video_asset(filename: str) -> str:
 
 
 def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
-    cv2, _ = try_import_video_packages()
-
     cap = cv2.VideoCapture(path)
     if not cap.isOpened():
         raise ValueError(f"Could not open video file {path}")
@@ -59,7 +57,6 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
 
 def video_to_pil_images_list(path: str,
                              num_frames: int = -1) -> List[Image.Image]:
-    cv2, _ = try_import_video_packages()
     frames = video_to_ndarrays(path, num_frames)
     return [
         Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
diff --git a/vllm/config.py b/vllm/config.py
index 17602bda15c69..de8ba029ddc23 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -29,6 +29,7 @@ from vllm.transformers_utils.config import (
     get_hf_text_config, get_pooling_config,
     get_sentence_transformer_tokenizer_config, is_encoder_decoder,
     try_get_generation_config, uses_mrope)
+from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3
 from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
                         get_cpu_memory, print_warning_once, random_uuid,
@@ -372,15 +373,6 @@ class ModelConfig:
 
         """
         if is_s3(model) or is_s3(tokenizer):
-            try:
-                from vllm.transformers_utils.s3_utils import S3Model
-            except ImportError as err:
-                raise ImportError(
-                    "Please install Run:ai optional dependency "
-                    "to use the S3 capabilities. "
-                    "You can install it with: pip install vllm[runai]"
-                ) from err
-
             if is_s3(model):
                 self.s3_model = S3Model()
                 self.s3_model.pull_files(model, allow_pattern=["*config.json"])
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 24e554e6060ab..f2d9293b31a83 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -48,6 +48,7 @@ from vllm.model_executor.model_loader.weight_utils import (
     runai_safetensors_weights_iterator, safetensors_weights_iterator)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
+from vllm.transformers_utils.s3_utils import glob as s3_glob
 from vllm.transformers_utils.utils import is_s3
 from vllm.utils import is_pin_memory_available
 
@@ -1269,16 +1270,6 @@ class RunaiModelStreamerLoader(BaseModelLoader):
 
         If the model is not local, it will be downloaded."""
         is_s3_path = is_s3(model_name_or_path)
-        if is_s3_path:
-            try:
-                from vllm.transformers_utils.s3_utils import glob as s3_glob
-            except ImportError as err:
-                raise ImportError(
-                    "Please install Run:ai optional dependency "
-                    "to use the S3 capabilities. "
-                    "You can install it with: pip install vllm[runai]"
-                ) from err
-
         is_local = os.path.isdir(model_name_or_path)
         safetensors_pattern = "*.safetensors"
         index_file = SAFE_WEIGHTS_INDEX_NAME
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 87f3fcb5cae00..8b929f299c8d8 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -19,9 +19,7 @@ from vllm.engine.llm_engine import LLMEngine
 from vllm.logger import init_logger
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
-from vllm.utils import FlexibleArgumentParser
-
-tensorizer_error_msg = None
+from vllm.utils import FlexibleArgumentParser, PlaceholderModule
 
 try:
     from tensorizer import (DecryptionParams, EncryptionParams,
@@ -34,8 +32,19 @@ try:
         open_stream,
         mode=mode,
     ) for mode in ("rb", "wb+"))
-except ImportError as e:
-    tensorizer_error_msg = str(e)
+except ImportError:
+    tensorizer = PlaceholderModule("tensorizer")
+    DecryptionParams = tensorizer.placeholder_attr("DecryptionParams")
+    EncryptionParams = tensorizer.placeholder_attr("EncryptionParams")
+    TensorDeserializer = tensorizer.placeholder_attr("TensorDeserializer")
+    TensorSerializer = tensorizer.placeholder_attr("TensorSerializer")
+    open_stream = tensorizer.placeholder_attr("stream_io.open_stream")
+    convert_bytes = tensorizer.placeholder_attr("utils.convert_bytes")
+    get_mem_usage = tensorizer.placeholder_attr("utils.get_mem_usage")
+    no_init_or_tensor = tensorizer.placeholder_attr("utils.no_init_or_tensor")
+
+    _read_stream = tensorizer.placeholder_attr("_read_stream")
+    _write_stream = tensorizer.placeholder_attr("_write_stream")
 
 __all__ = [
     'EncryptionParams', 'DecryptionParams', 'TensorDeserializer',
@@ -267,12 +276,6 @@ class TensorizerAgent:
     """
 
     def __init__(self, tensorizer_config: TensorizerConfig, vllm_config):
-        if tensorizer_error_msg is not None:
-            raise ImportError(
-                "Tensorizer is not installed. Please install tensorizer "
-                "to use this feature with `pip install vllm[tensorizer]`. "
-                "Error message: {}".format(tensorizer_error_msg))
-
         self.tensorizer_config = tensorizer_config
         self.tensorizer_args = (
             self.tensorizer_config._construct_tensorizer_args())
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index f2a9e7e2687cb..8aa0c98df70d2 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -25,7 +25,15 @@ from vllm.model_executor.layers.quantization import (QuantizationConfig,
                                                      get_quantization_config)
 from vllm.model_executor.layers.quantization.schema import QuantParamSchema
 from vllm.platforms import current_platform
-from vllm.utils import print_warning_once
+from vllm.utils import PlaceholderModule, print_warning_once
+
+try:
+    from runai_model_streamer import SafetensorsStreamer
+except ImportError:
+    runai_model_streamer = PlaceholderModule(
+        "runai_model_streamer")  # type: ignore[assignment]
+    SafetensorsStreamer = runai_model_streamer.placeholder_attr(
+        "SafetensorsStreamer")
 
 logger = init_logger(__name__)
 
@@ -414,13 +422,6 @@ def runai_safetensors_weights_iterator(
     hf_weights_files: List[str]
 ) -> Generator[Tuple[str, torch.Tensor], None, None]:
     """Iterate over the weights in the model safetensor files."""
-    try:
-        from runai_model_streamer import SafetensorsStreamer
-    except ImportError as err:
-        raise ImportError(
-            "Please install Run:ai optional dependency."
-            "You can install it with: pip install vllm[runai]") from err
-
     enable_tqdm = not torch.distributed.is_initialized(
     ) or torch.distributed.get_rank() == 0
     with SafetensorsStreamer() as streamer:
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index 314d21b746236..ed3bb82bf0aaa 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -1,13 +1,17 @@
-from typing import Any
-
 import numpy as np
 import numpy.typing as npt
 
 from vllm.inputs.registry import InputContext
+from vllm.utils import PlaceholderModule
 
 from .base import MultiModalPlugin
 from .inputs import AudioItem, MultiModalData, MultiModalKwargs
 
+try:
+    import librosa
+except ImportError:
+    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
+
 
 class AudioPlugin(MultiModalPlugin):
     """Plugin for audio data."""
@@ -28,26 +32,10 @@ class AudioPlugin(MultiModalPlugin):
             "There is no default maximum multimodal tokens")
 
 
-def try_import_audio_packages() -> tuple[Any, Any]:
-    try:
-        import librosa
-        import soundfile
-    except ImportError as exc:
-        raise ImportError(
-            "Please install vllm[audio] for audio support.") from exc
-    return librosa, soundfile
-
-
 def resample_audio(
     audio: npt.NDArray[np.floating],
     *,
     orig_sr: float,
     target_sr: float,
 ) -> npt.NDArray[np.floating]:
-    try:
-        import librosa
-    except ImportError as exc:
-        msg = "Please install vllm[audio] for audio support."
-        raise ImportError(msg) from exc
-
     return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 1cb9036bdfda3..a49da2bdee972 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -13,10 +13,24 @@ import vllm.envs as envs
 from vllm.connections import global_http_connection
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
+from vllm.utils import PlaceholderModule
 
-from .audio import try_import_audio_packages
 from .inputs import MultiModalDataDict, PlaceholderRange
-from .video import try_import_video_packages
+
+try:
+    import decord
+except ImportError:
+    decord = PlaceholderModule("decord")  # type: ignore[assignment]
+
+try:
+    import librosa
+except ImportError:
+    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
+
+try:
+    import soundfile
+except ImportError:
+    soundfile = PlaceholderModule("soundfile")  # type: ignore[assignment]
 
 logger = init_logger(__name__)
 
@@ -128,8 +142,6 @@ async def async_fetch_image(image_url: str,
 
 
 def _load_video_from_bytes(b: bytes, num_frames: int = 32) -> npt.NDArray:
-    _, decord = try_import_video_packages()
-
     video_path = BytesIO(b)
     vr = decord.VideoReader(video_path, num_threads=1)
     total_frame_num = len(vr)
@@ -204,8 +216,6 @@ def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]:
     """
     Load audio from a URL.
     """
-    librosa, _ = try_import_audio_packages()
-
     if audio_url.startswith("http"):
         audio_bytes = global_http_connection.get_bytes(
             audio_url,
@@ -226,8 +236,6 @@ async def async_fetch_audio(
     """
     Asynchronously fetch audio from a URL.
     """
-    librosa, _ = try_import_audio_packages()
-
     if audio_url.startswith("http"):
         audio_bytes = await global_http_connection.async_get_bytes(
             audio_url,
@@ -286,8 +294,6 @@ def encode_audio_base64(
     sampling_rate: int,
 ) -> str:
     """Encode audio as base64."""
-    _, soundfile = try_import_audio_packages()
-
     buffered = BytesIO()
     soundfile.write(buffered, audio, sampling_rate, format="WAV")
 
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index bfcdef70718bc..c4be100562703 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -1,6 +1,7 @@
 from functools import lru_cache
 from typing import TYPE_CHECKING, Any, Dict, Optional
 
+import cv2
 import numpy as np
 import numpy.typing as npt
 
@@ -78,19 +79,7 @@ class VideoPlugin(ImagePlugin):
         return 4096
 
 
-def try_import_video_packages() -> tuple[Any, Any]:
-    try:
-        import cv2
-        import decord
-    except ImportError as exc:
-        raise ImportError(
-            "Please install vllm[video] for video support.") from exc
-    return cv2, decord
-
-
 def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray:
-    cv2, _ = try_import_video_packages()
-
     num_frames, _, _, channels = frames.shape
     new_height, new_width = size
     resized_frames = np.empty((num_frames, new_height, new_width, channels),
diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py
index 6f63dab74d696..6ae68161bbd97 100644
--- a/vllm/transformers_utils/s3_utils.py
+++ b/vllm/transformers_utils/s3_utils.py
@@ -6,7 +6,12 @@ import tempfile
 from pathlib import Path
 from typing import Optional
 
-import boto3
+from vllm.utils import PlaceholderModule
+
+try:
+    import boto3
+except ImportError:
+    boto3 = PlaceholderModule("boto3")  # type: ignore[assignment]
 
 
 def _filter_allow(paths: list[str], patterns: list[str]) -> list[str]:
diff --git a/vllm/utils.py b/vllm/utils.py
index 49e532540d7ee..7d290dcb7dad0 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -6,10 +6,12 @@ import datetime
 import enum
 import gc
 import getpass
+import importlib.metadata
 import importlib.util
 import inspect
 import ipaddress
 import os
+import re
 import signal
 import socket
 import subprocess
@@ -1550,6 +1552,67 @@ def import_from_path(module_name: str, file_path: Union[str, os.PathLike]):
     return module
 
 
+@lru_cache(maxsize=None)
+def get_vllm_optional_dependencies():
+    metadata = importlib.metadata.metadata("vllm")
+    requirements = metadata.get_all("Requires-Dist", [])
+    extras = metadata.get_all("Provides-Extra", [])
+
+    return {
+        extra: [
+            re.split(r";|>=|<=|==", req)[0] for req in requirements
+            if req.endswith(f'extra == "{extra}"')
+        ]
+        for extra in extras
+    }
+
+
+@dataclass(frozen=True)
+class PlaceholderModule:
+    """
+    A placeholder object to use when a module does not exist.
+
+    This enables more informative errors when trying to access attributes
+    of a module that does not exists.
+    """
+    name: str
+
+    def placeholder_attr(self, attr_path: str):
+        return _PlaceholderModuleAttr(self, attr_path)
+
+    def __getattr__(self, key: str):
+        name = self.name
+
+        try:
+            importlib.import_module(self.name)
+        except ImportError as exc:
+            for extra, names in get_vllm_optional_dependencies().items():
+                if name in names:
+                    msg = f"Please install vllm[{extra}] for {extra} support"
+                    raise ImportError(msg) from exc
+
+            raise exc
+
+        raise AssertionError("PlaceholderModule should not be used "
+                             "when the original module can be imported")
+
+
+@dataclass(frozen=True)
+class _PlaceholderModuleAttr:
+    module: PlaceholderModule
+    attr_path: str
+
+    def placeholder_attr(self, attr_path: str):
+        return _PlaceholderModuleAttr(self.module,
+                                      f"{self.attr_path}.{attr_path}")
+
+    def __getattr__(self, key: str):
+        getattr(self.module, f"{self.attr_path}.{key}")
+
+        raise AssertionError("PlaceholderModule should not be used "
+                             "when the original module can be imported")
+
+
 # create a library to hold the custom op
 vllm_lib = Library("vllm", "FRAGMENT")  # noqa
 

From b85a977822c4216430c5a27a2fc47c93277e4b29 Mon Sep 17 00:00:00 2001
From: Isotr0py <mozf@mail2.sysu.edu.cn>
Date: Fri, 27 Dec 2024 01:31:29 +0800
Subject: [PATCH 8/8] [Doc] Add video example to openai client for multimodal
 (#11521)

Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 docs/source/usage/multimodal_inputs.md        | 52 ++++++++++++-
 ...i_chat_completion_client_for_multimodal.py | 73 +++++++++++++++++--
 2 files changed, 114 insertions(+), 11 deletions(-)

diff --git a/docs/source/usage/multimodal_inputs.md b/docs/source/usage/multimodal_inputs.md
index 82a3f3b8909a1..4f45a9f448cf0 100644
--- a/docs/source/usage/multimodal_inputs.md
+++ b/docs/source/usage/multimodal_inputs.md
@@ -294,12 +294,58 @@ $ export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
 
 ### Video
 
-Instead of {code}`image_url`, you can pass a video file via {code}`video_url`.
+Instead of {code}`image_url`, you can pass a video file via {code}`video_url`. Here is a simple example using [LLaVA-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf).
 
-You can use [these tests](gh-file:entrypoints/openai/test_video.py) as reference.
+First, launch the OpenAI-compatible server:
+
+```bash
+vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model-len 8192
+```
+
+Then, you can use the OpenAI client as follows:
+```python
+from openai import OpenAI
+
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+client = OpenAI(
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
+
+## Use video url in the payload
+chat_completion_from_url = client.chat.completions.create(
+    messages=[{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "What's in this video?"
+            },
+            {
+                "type": "video_url",
+                "video_url": {
+                    "url": video_url
+                },
+            },
+        ],
+    }],
+    model=model,
+    max_completion_tokens=64,
+)
+
+result = chat_completion_from_url.choices[0].message.content
+print("Chat completion output from image url:", result)
+```
+
+Full example: <gh-file:examples/openai_chat_completion_client_for_multimodal.py>
 
 ````{note}
-By default, the timeout for fetching videos through HTTP URL url is `30` seconds.
+By default, the timeout for fetching videos through HTTP URL is `30` seconds.
 You can override this by setting the environment variable:
 
 ```console
diff --git a/examples/openai_chat_completion_client_for_multimodal.py b/examples/openai_chat_completion_client_for_multimodal.py
index 6a160fd70423f..213d075542e81 100644
--- a/examples/openai_chat_completion_client_for_multimodal.py
+++ b/examples/openai_chat_completion_client_for_multimodal.py
@@ -18,7 +18,6 @@ import base64
 import requests
 from openai import OpenAI
 
-from vllm.assets.audio import AudioAsset
 from vllm.utils import FlexibleArgumentParser
 
 # Modify OpenAI's API key and API base to use vLLM's API server.
@@ -151,8 +150,66 @@ def run_multi_image() -> None:
     print("Chat completion output:", result)
 
 
+# Video input inference
+def run_video() -> None:
+    video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
+    video_base64 = encode_base64_content_from_url(video_url)
+
+    ## Use video url in the payload
+    chat_completion_from_url = client.chat.completions.create(
+        messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this video?"
+                },
+                {
+                    "type": "video_url",
+                    "video_url": {
+                        "url": video_url
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_url.choices[0].message.content
+    print("Chat completion output from image url:", result)
+
+    ## Use base64 encoded video in the payload
+    chat_completion_from_base64 = client.chat.completions.create(
+        messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "What's in this video?"
+                },
+                {
+                    "type": "video_url",
+                    "video_url": {
+                        "url": f"data:video/mp4;base64,{video_base64}"
+                    },
+                },
+            ],
+        }],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_base64.choices[0].message.content
+    print("Chat completion output from base64 encoded image:", result)
+
+
 # Audio input inference
 def run_audio() -> None:
+    from vllm.assets.audio import AudioAsset
+
     audio_url = AudioAsset("winning_call").url
     audio_base64 = encode_base64_content_from_url(audio_url)
 
@@ -240,6 +297,7 @@ example_function_map = {
     "text-only": run_text_only,
     "single-image": run_single_image,
     "multi-image": run_multi_image,
+    "video": run_video,
     "audio": run_audio,
 }
 
@@ -253,12 +311,11 @@ if __name__ == "__main__":
     parser = FlexibleArgumentParser(
         description='Demo on using OpenAI client for online inference with '
         'multimodal language models served with vLLM.')
-    parser.add_argument(
-        '--chat-type',
-        '-c',
-        type=str,
-        default="single-image",
-        choices=["text-only", "single-image", "multi-image", "audio"],
-        help='Conversation type with multimodal data.')
+    parser.add_argument('--chat-type',
+                        '-c',
+                        type=str,
+                        default="single-image",
+                        choices=list(example_function_map.keys()),
+                        help='Conversation type with multimodal data.')
     args = parser.parse_args()
     main(args)