From 08b2d845d6261309bfdb46933f872eebe4e2bb31 Mon Sep 17 00:00:00 2001
From: Farzad Abdolhosseini <farzad.abdolhosseini@gmail.com>
Date: Mon, 10 Feb 2025 14:02:48 -0800
Subject: [PATCH 01/43] [Model] Ultravox Model: Support v0.5 Release (#12912)

Signed-off-by: Farzad Abdolhosseini <farzad@fixie.ai>
---
 docs/source/models/supported_models.md        |  2 +-
 docs/source/serving/multimodal_inputs.md      |  4 +--
 examples/offline_inference/audio_language.py  |  4 +--
 ...i_chat_completion_client_for_multimodal.py |  2 +-
 tests/distributed/test_pipeline_parallel.py   |  4 +--
 tests/entrypoints/openai/test_audio.py        |  2 +-
 tests/entrypoints/test_chat_utils.py          |  2 +-
 .../audio_language/test_ultravox.py           |  2 +-
 .../multimodal/processing/test_common.py      |  2 +-
 tests/models/registry.py                      |  2 +-
 vllm/model_executor/models/ultravox.py        | 26 ++++++++++++-------
 vllm/transformers_utils/configs/ultravox.py   |  6 +++++
 12 files changed, 36 insertions(+), 22 deletions(-)
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
index 91e6c42d52611..55b3f52356cd0 100644
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -856,7 +856,7 @@ See [this page](#generative-models) for more information on how to use generativ
 - * `UltravoxModel`
   * Ultravox
   * T + A<sup>E+</sup>
-  * `fixie-ai/ultravox-v0_3`
+  * `fixie-ai/ultravox-v0_5-llama-3_2-1b`
   * ✅︎
   * ✅︎
   * ✅︎
diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md
index 217b531e83788..ade59e3773839 100644
--- a/docs/source/serving/multimodal_inputs.md
+++ b/docs/source/serving/multimodal_inputs.md
@@ -359,12 +359,12 @@ export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
 ### Audio
 
 Audio input is supported according to [OpenAI Audio API](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in).
-Here is a simple example using Ultravox-v0.3.
+Here is a simple example using Ultravox-v0.5-1B.
 
 First, launch the OpenAI-compatible server:
 
 ```bash
-vllm serve fixie-ai/ultravox-v0_3
+vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b
 ```
 
 Then, you can use the OpenAI client as follows:
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index 707ca9f878961..3e3034a02f0f1 100644
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -24,9 +24,9 @@ question_per_audio_count = {
 # Unless specified, these settings have been tested to work on a single L4.
 
 
-# Ultravox 0.3
+# Ultravox 0.5-1B
 def run_ultravox(question: str, audio_count: int):
-    model_name = "fixie-ai/ultravox-v0_3"
+    model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     messages = [{
diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
index d5f798a8dae62..ecfcf05a90d16 100644
--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -12,7 +12,7 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
     --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
 
 (audio inference with Ultravox)
-vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096
+vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096
 """
 import base64
 
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index 5b6741d74efc0..5d7cb9e408909 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -215,7 +215,7 @@ MULTIMODAL_MODELS = {
     "Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
     "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
     "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
-    "fixie-ai/ultravox-v0_3": PPTestSettings.fast(trust_remote_code=True),
+    "fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
     # [Encoder-decoder]
     # TODO: Implement PP
     # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
@@ -234,7 +234,7 @@ TEST_MODELS = [
     # [MULTIMODAL GENERATION]
     "OpenGVLab/InternVL2-1B",
     "microsoft/Phi-3-vision-128k-instruct",
-    "fixie-ai/ultravox-v0_3",
+    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
     # [LANGUAGE GENERATION - HYBRID ARCH]
     "ai21labs/Jamba-tiny-dev",
 ]
diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py
index 3459f24834dbc..fe7299a48e6f6 100644
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -11,7 +11,7 @@ from vllm.multimodal.utils import encode_audio_base64, fetch_audio
 
 from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "fixie-ai/ultravox-v0_3"
+MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
 TEST_AUDIO_URLS = [
     AudioAsset("winning_call").url,
 ]
diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 5c469007af23e..c52fa905c80b3 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -21,7 +21,7 @@ from ..utils import VLLM_PATH
 EXAMPLES_DIR = VLLM_PATH / "examples"
 
 PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
-ULTRAVOX_MODEL_ID = "fixie-ai/ultravox-v0_3"
+ULTRAVOX_MODEL_ID = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
 QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
 MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"
 LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B"
diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py
index fe9361d126120..d1f643a8fdb73 100644
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -15,7 +15,7 @@ from ....conftest import HfRunner, VllmRunner
 from ....utils import RemoteOpenAIServer
 from ...utils import check_logprobs_close
 
-MODEL_NAME = "fixie-ai/ultravox-v0_3"
+MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
 
 AudioTuple = Tuple[np.ndarray, int]
 
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index a56a9e2beef22..6244056c7474a 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -164,7 +164,7 @@ def _test_processing_correctness(
     "Qwen/Qwen2-VL-2B-Instruct",
     "Qwen/Qwen2.5-VL-3B-Instruct",
     "Qwen/Qwen2-Audio-7B-Instruct",
-    "fixie-ai/ultravox-v0_3",
+    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 3fd94b89c8a60..66b7d3c2e77b5 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -267,7 +267,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
     "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"),  # noqa: E501
     "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct",  # noqa: E501
                                                           min_transformers_version="4.49"),  # noqa: E501
-    "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3",
+    "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",
                                      trust_remote_code=True),
     # [Encoder-decoder]
     "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"),  # noqa: E501
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 9da0682cfa866..063997a14a66f 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -258,27 +258,35 @@ class UltravoxProjector(nn.Module):
         super().__init__()
         self.hidden_dim = config.hidden_size
         self._pad_and_stack = StackAudioFrames(config.stack_factor)
-        dim = config.audio_config.hidden_size * config.stack_factor
-        self.ln_pre = RMSNorm(dim)
-        self.linear_1 = nn.Linear(dim, self.hidden_dim, bias=False)
-        dim = self.hidden_dim
+        dim_in = config.audio_config.hidden_size * config.stack_factor
+        self.ln_pre = RMSNorm(dim_in)
+        self.linear_1 = nn.Linear(dim_in, self.hidden_dim, bias=False)
+        dim_mid = self.hidden_dim
 
         if config.projector_act == "swiglu":
             self.act = MulAndSilu()
-            dim = dim // 2
+            dim_mid = dim_mid // 2
         else:
             self.act = get_act_fn(config.projector_act)
 
-        self.linear_2 = nn.Linear(dim,
-                                  config.text_config.hidden_size,
-                                  bias=False)
-        self.ln_post = RMSNorm(config.text_config.hidden_size)
+        dim_out = config.text_config.hidden_size
+        self.linear_2 = nn.Linear(dim_mid, dim_out, bias=False)
+
+        # Ultravox v0.4.1 and below use layer_norm after the second linear layer
+        # while v0.5.0 and above uses layer_norm after the first linear layer.
+        if config.projector_ln_mid:
+            self.ln_mid: nn.Module = RMSNorm(dim_mid)
+            self.ln_post = nn.Identity()
+        else:
+            self.ln_mid = nn.Identity()
+            self.ln_post = RMSNorm(dim_out)
 
     def forward(self, audio_features: torch.Tensor) -> torch.Tensor:
         audio_features = self._pad_and_stack(audio_features)
         audio_features = self.ln_pre(audio_features)
         hidden_states = self.linear_1(audio_features)
         hidden_states = self.act(hidden_states)
+        hidden_states = self.ln_mid(hidden_states)
         hidden_states = self.linear_2(hidden_states)
         hidden_states = self.ln_post(hidden_states)
         return hidden_states
diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py
index 99715ba6d0b09..6b2765db94e78 100644
--- a/vllm/transformers_utils/configs/ultravox.py
+++ b/vllm/transformers_utils/configs/ultravox.py
@@ -37,6 +37,10 @@ class UltravoxConfig(transformers.PretrainedConfig):
             The LoRA configuration for finetuning the text model.
         audio_model_lora_config (`LoraConfigSimplified`, *optional*):
             The LoRA configuration for finetuning the audio model.
+        projector_ln_mid (`bool`, *optional*, defaults to `False`):
+            Whether to apply layer normalization at the middle of the
+            projector or at the end. Versions v0.4.1 and below
+            use `False`, but v0.5 and above use `True`.
     """
 
     model_type = "ultravox"
@@ -56,6 +60,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
         projector_act: str = "swiglu",
         text_model_lora_config: Optional[Dict[str, Any]] = None,
         audio_model_lora_config: Optional[Dict[str, Any]] = None,
+        projector_ln_mid: bool = False,
         **kwargs,
     ):
         self.ignore_index = ignore_index
@@ -68,6 +73,7 @@ class UltravoxConfig(transformers.PretrainedConfig):
         self.stack_factor = stack_factor
         self.norm_init = norm_init
         self.projector_act = projector_act
+        self.projector_ln_mid = projector_ln_mid
 
         if text_model_id is not None:
             # Avoid circular import

From 91e876750eace8e899ab25cd5d93fc365906c07b Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Mon, 10 Feb 2025 18:06:16 -0800
Subject: [PATCH 02/43] [misc] Fix setup.py condition to avoid AMD from being
 mistaken with CPU (#13022)

Signed-off-by: kevin <kevin@anyscale.com>
---
 setup.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 3e2adadf6704f..27e5aab760f9a 100755
--- a/setup.py
+++ b/setup.py
@@ -48,8 +48,9 @@ elif not (sys.platform.startswith("linux")
         "so vLLM may not be able to run correctly", sys.platform)
     VLLM_TARGET_DEVICE = "empty"
 elif (sys.platform.startswith("linux") and torch.version.cuda is None
-      and os.getenv("VLLM_TARGET_DEVICE") is None):
-    # if cuda is not available and VLLM_TARGET_DEVICE is not set,
+      and os.getenv("VLLM_TARGET_DEVICE") is None
+      and torch.version.hip is None):
+    # if cuda or hip is not available and VLLM_TARGET_DEVICE is not set,
     # fallback to cpu
     VLLM_TARGET_DEVICE = "cpu"
 

From 2ff4857678044407a959398178a7a04a9530919a Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Mon, 10 Feb 2025 18:10:06 -0800
Subject: [PATCH 03/43] [V1][Minor] Move scheduler outputs to a separate file
 (#13062)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 vllm/v1/core/scheduler.py          |  89 +-----------------------
 vllm/v1/core/scheduler_output.py   | 108 +++++++++++++++++++++++++++++
 vllm/v1/worker/gpu_model_runner.py |   2 +-
 vllm/v1/worker/gpu_worker.py       |   3 +-
 4 files changed, 113 insertions(+), 89 deletions(-)
 create mode 100644 vllm/v1/core/scheduler_output.py

diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index 1aa34ee386027..1c54914d182ba 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -1,26 +1,20 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from collections import deque
-from dataclasses import dataclass
-from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set,
-                    Tuple, Union)
+from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union
 
 from vllm.config import CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig
 from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.sampling_params import SamplingParams
 from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
                                                 compute_encoder_budget)
 from vllm.v1.core.kv_cache_manager import KVCacheManager
+from vllm.v1.core.scheduler_output import (CachedRequestData, NewRequestData,
+                                           SchedulerOutput)
 from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs
 from vllm.v1.metrics.stats import SchedulerStats
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 
-if TYPE_CHECKING:
-    from vllm.multimodal import MultiModalKwargs
-    from vllm.multimodal.base import PlaceholderRange
-
 logger = init_logger(__name__)
 
 
@@ -600,80 +594,3 @@ class Scheduler:
             num_waiting_reqs=len(self.waiting),
             gpu_cache_usage=self.kv_cache_manager.usage,
         )
-
-
-@dataclass
-class NewRequestData:
-
-    req_id: str
-    prompt_token_ids: List[int]
-    prompt: Optional[str]
-    mm_inputs: List["MultiModalKwargs"]
-    mm_hashes: List[str]
-    mm_positions: List["PlaceholderRange"]
-    sampling_params: SamplingParams
-    block_ids: List[int]
-    num_computed_tokens: int
-    lora_request: Optional[LoRARequest]
-
-    @classmethod
-    def from_request(
-        cls,
-        request: Request,
-        block_ids: List[int],
-        num_computed_tokens: int,
-    ) -> "NewRequestData":
-        return cls(
-            req_id=request.request_id,
-            prompt_token_ids=request.prompt_token_ids,
-            prompt=request.prompt,
-            mm_inputs=request.mm_inputs,
-            mm_hashes=request.mm_hashes,
-            mm_positions=request.mm_positions,
-            sampling_params=request.sampling_params,
-            block_ids=block_ids,
-            num_computed_tokens=num_computed_tokens,
-            lora_request=request.lora_request,
-        )
-
-
-@dataclass
-class CachedRequestData:
-
-    req_id: str
-    # If resumed_from_preemption is False, new_block_ids will be appended to
-    # the request's block IDs. If True, new_block_ids will be used as the
-    # request's block IDs instead of appending to the existing block IDs.
-    resumed_from_preemption: bool
-    new_block_ids: List[int]
-    num_computed_tokens: int
-
-    @classmethod
-    def from_request(
-        cls,
-        request: Request,
-        resumed_from_preemption: bool,
-        new_block_ids: List[int],
-        num_computed_tokens: int,
-    ) -> "CachedRequestData":
-        return cls(
-            req_id=request.request_id,
-            resumed_from_preemption=resumed_from_preemption,
-            new_block_ids=new_block_ids,
-            num_computed_tokens=num_computed_tokens,
-        )
-
-
-@dataclass
-class SchedulerOutput:
-
-    scheduled_new_reqs: List[NewRequestData]
-    scheduled_cached_reqs: List[CachedRequestData]
-
-    num_scheduled_tokens: Dict[str, int]
-    total_num_scheduled_tokens: int
-    scheduled_encoder_inputs: Dict[str, List[int]]
-    num_common_prefix_blocks: int
-
-    finished_req_ids: Set[str]
-    free_encoder_input_ids: List[Tuple[str, int]]
diff --git a/vllm/v1/core/scheduler_output.py b/vllm/v1/core/scheduler_output.py
new file mode 100644
index 0000000000000..990b3dd0ed780
--- /dev/null
+++ b/vllm/v1/core/scheduler_output.py
@@ -0,0 +1,108 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple
+
+if TYPE_CHECKING:
+    from vllm.lora.request import LoRARequest
+    from vllm.multimodal import MultiModalKwargs
+    from vllm.multimodal.base import PlaceholderRange
+    from vllm.sampling_params import SamplingParams
+    from vllm.v1.request import Request
+
+
+@dataclass
+class NewRequestData:
+
+    req_id: str
+    prompt_token_ids: List[int]
+    prompt: Optional[str]
+    mm_inputs: List["MultiModalKwargs"]
+    mm_hashes: List[str]
+    mm_positions: List["PlaceholderRange"]
+    sampling_params: "SamplingParams"
+    block_ids: List[int]
+    num_computed_tokens: int
+    lora_request: Optional["LoRARequest"]
+
+    @classmethod
+    def from_request(
+        cls,
+        request: "Request",
+        block_ids: List[int],
+        num_computed_tokens: int,
+    ) -> "NewRequestData":
+        return cls(
+            req_id=request.request_id,
+            prompt_token_ids=request.prompt_token_ids,
+            prompt=request.prompt,
+            mm_inputs=request.mm_inputs,
+            mm_hashes=request.mm_hashes,
+            mm_positions=request.mm_positions,
+            sampling_params=request.sampling_params,
+            block_ids=block_ids,
+            num_computed_tokens=num_computed_tokens,
+            lora_request=request.lora_request,
+        )
+
+
+@dataclass
+class CachedRequestData:
+
+    req_id: str
+    # If resumed_from_preemption is False, new_block_ids will be appended to
+    # the request's block IDs. If True, new_block_ids will be used as the
+    # request's block IDs instead of appending to the existing block IDs.
+    resumed_from_preemption: bool
+    new_block_ids: List[int]
+    num_computed_tokens: int
+
+    @classmethod
+    def from_request(
+        cls,
+        request: "Request",
+        resumed_from_preemption: bool,
+        new_block_ids: List[int],
+        num_computed_tokens: int,
+    ) -> "CachedRequestData":
+        return cls(
+            req_id=request.request_id,
+            resumed_from_preemption=resumed_from_preemption,
+            new_block_ids=new_block_ids,
+            num_computed_tokens=num_computed_tokens,
+        )
+
+
+@dataclass
+class SchedulerOutput:
+
+    # List of the requests that are scheduled for the first time.
+    # We cache the request's data in each worker process, so that we don't
+    # need to re-send it every scheduling step.
+    scheduled_new_reqs: List[NewRequestData]
+    # List of the requests that have been scheduled before.
+    # Since the request's data is already cached in the worker processes,
+    # we only send the diff to minimize the communication cost.
+    scheduled_cached_reqs: List[CachedRequestData]
+
+    # req_id -> num_scheduled_tokens
+    # Number of tokens scheduled for each request.
+    num_scheduled_tokens: Dict[str, int]
+    # Total number of tokens scheduled for all requests.
+    # Equal to sum(num_scheduled_tokens.values())
+    total_num_scheduled_tokens: int
+    # req_id -> encoder input indices that need processing.
+    # E.g., if a request has [0, 1], it could mean the vision encoder needs
+    # to process that the request's 0-th and 1-th images in the current step.
+    scheduled_encoder_inputs: Dict[str, List[int]]
+    # Number of common prefix blocks for all requests.
+    # This can be used for cascade attention.
+    num_common_prefix_blocks: int
+
+    # Request IDs that are finished in between the previous and the current
+    # steps. This is used to notify the workers about the finished requests
+    # so that they can free the cached states for those requests.
+    finished_req_ids: Set[str]
+    # List of (req_id, encoder_input_index) tuples.
+    # Used to free the encoder cache.
+    free_encoder_input_ids: List[Tuple[str, int]]
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index fdbca70bda711..9b1eab613bf7b 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -36,7 +36,7 @@ from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
 
 if TYPE_CHECKING:
-    from vllm.v1.core.scheduler import SchedulerOutput
+    from vllm.v1.core.scheduler_output import SchedulerOutput
 
 logger = init_logger(__name__)
 
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 0adb69073397c..ad53f90b86652 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -18,7 +18,6 @@ from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
 from vllm.utils import GiB_bytes
-from vllm.v1.core.scheduler import SchedulerOutput
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
@@ -26,7 +25,7 @@ from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 logger = init_logger(__name__)
 
 if TYPE_CHECKING:
-    from vllm.v1.core.scheduler import SchedulerOutput
+    from vllm.v1.core.scheduler_output import SchedulerOutput
 
 
 class Worker:

From 2c0f58203c111bcc331f931664400acfc94cb9bc Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Mon, 10 Feb 2025 18:24:29 -0800
Subject: [PATCH 04/43] [Docs] Annouce Meta Meetup (#13065)

Signed-off-by: simon-mo <simon.mo@hey.com>
---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index f04acf09cff3d..f22a1f9c5c80e 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,10 @@ Easy, fast, and cheap LLM serving for everyone
 
 ---
 
+We are excited to invite you to our Menlo Park meetup with Meta, evening of Thursday, February 27! Meta engineers will discuss the improvements on top of vLLM, and vLLM contributors will share updates from the v0.7.x series of releases. [Register Now](https://lu.ma/h7g3kuj9)
+
+---
+
 *Latest News* 🔥
 
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).

From cb080f32e38e87beda897d0602bf6a0d0c79d00f Mon Sep 17 00:00:00 2001
From: Florian Greinacher <florian.greinacher@siemens.com>
Date: Tue, 11 Feb 2025 04:33:33 +0100
Subject: [PATCH 05/43] [Bugfix] Support missing tool parameters in mistral
 tokenizer (#12884)

Signed-off-by: Florian Greinacher <florian.greinacher@siemens.com>
---
 tests/tokenization/test_mistral_tokenizer.py  | 50 ++++++++++++++++
 vllm/transformers_utils/tokenizers/mistral.py | 57 ++++++++++++-------
 2 files changed, 88 insertions(+), 19 deletions(-)
 create mode 100644 tests/tokenization/test_mistral_tokenizer.py

diff --git a/tests/tokenization/test_mistral_tokenizer.py b/tests/tokenization/test_mistral_tokenizer.py
new file mode 100644
index 0000000000000..03e1f1fadd731
--- /dev/null
+++ b/tests/tokenization/test_mistral_tokenizer.py
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+from mistral_common.protocol.instruct.messages import UserMessage
+from mistral_common.protocol.instruct.request import ChatCompletionRequest
+from mistral_common.protocol.instruct.tool_calls import Function, Tool
+
+from vllm.transformers_utils.tokenizers.mistral import (
+    make_mistral_chat_completion_request)
+
+
+# yapf: enable
+@pytest.mark.parametrize(
+    "openai_request,expected_mistral_request",
+    [(
+        {
+            "messages": [{
+                "role": "user",
+                "content": "What is the current local date and time?",
+            }],
+            "tools": [{
+                "type": "function",
+                "function": {
+                    "description": "Fetch the current local date and time.",
+                    "name": "get_current_time",
+                },
+            }],
+        },
+        ChatCompletionRequest(
+            messages=[
+                UserMessage(content="What is the current local date and time?")
+            ],
+            tools=[
+                Tool(
+                    type="function",
+                    function=Function(
+                        name="get_current_time",
+                        description="Fetch the current local date and time.",
+                        parameters={},
+                    ),
+                )
+            ],
+        ),
+    )],
+)
+def test_make_mistral_chat_completion_request(openai_request,
+                                              expected_mistral_request):
+    assert (make_mistral_chat_completion_request(
+        openai_request["messages"],
+        openai_request["tools"]) == expected_mistral_request)
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 8d96fcd278e67..f08923e7401f3 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -104,6 +104,42 @@ def find_tokenizer_file(files: List[str]):
     return matched_files[0]
 
 
+def make_mistral_chat_completion_request(
+        messages: List["ChatCompletionMessageParam"],
+        tools: Optional[List[Dict[str,
+                                  Any]]] = None) -> "ChatCompletionRequest":
+    last_message = cast(Dict[str, Any], messages[-1])
+    if last_message["role"] == "assistant":
+        last_message["prefix"] = True
+
+        last_message = cast(Dict[str, Any], messages[-1])
+        if last_message["role"] == "assistant":
+            last_message["prefix"] = True
+
+    # mistral-common requires AssistantMessage content to be string [1].
+    #
+    # [1]: https://github.com/mistralai/mistral-common/blob/f4a06998b75ed78bbf5aaf569590b772ea26c9f6/src/mistral_common/protocol/instruct/messages.py#L80
+    for message in messages:
+        if message.get("role") == "assistant":
+            content = message.get("content")
+            if isinstance(content, list):
+                content = "\n".join(chunk.get("text") for chunk in content)
+                message["content"] = content
+
+    # The Mistral client, in comparison to the OpenAI client, requires the
+    # "parameters" dict to be present, even if it's empty.
+    if tools:
+        for function in [
+                tool["function"] for tool in tools
+                if tool["type"] == "function"
+        ]:
+            function.setdefault("parameters", {})
+
+    from mistral_common.protocol.instruct.request import ChatCompletionRequest
+    return ChatCompletionRequest(messages=messages,
+                                 tools=tools)  # type: ignore[type-var]
+
+
 class MistralTokenizer:
 
     def __init__(self, tokenizer: "PublicMistralTokenizer") -> None:
@@ -283,27 +319,10 @@ class MistralTokenizer:
 
     def apply_chat_template(self,
                             messages: List["ChatCompletionMessageParam"],
-                            tools: Optional[Dict[str, Any]] = None,
+                            tools: Optional[List[Dict[str, Any]]] = None,
                             **kwargs) -> List[int]:
 
-        last_message = cast(Dict[str, Any], messages[-1])
-        if last_message["role"] == "assistant":
-            last_message["prefix"] = True
-
-        from mistral_common.protocol.instruct.request import (
-            ChatCompletionRequest)
-
-        # mistral-common requires AssistantMessage content to be string [1].
-        #
-        # [1]: https://github.com/mistralai/mistral-common/blob/f4a06998b75ed78bbf5aaf569590b772ea26c9f6/src/mistral_common/protocol/instruct/messages.py#L80
-        for message in messages:
-            if message.get("role") == "assistant":
-                content = message.get("content")
-                if isinstance(content, list):
-                    content = "\n".join(chunk.get("text") for chunk in content)
-                    message["content"] = content
-        request = ChatCompletionRequest(messages=messages,
-                                        tools=tools)  # type: ignore[type-var]
+        request = make_mistral_chat_completion_request(messages, tools)
         encoded = self.mistral.encode_chat_completion(request)
 
         # encode-decode to get clean prompt

From 58047c6f0410fc7a86b64c88c092a246984b2342 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Mon, 10 Feb 2025 21:25:30 -0800
Subject: [PATCH 06/43] [Benchmark] Add BurstGPT to benchmark_serving (#13063)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
---
 benchmarks/README.md            |  8 +++++++
 benchmarks/benchmark_serving.py | 40 ++++++++++++++++++++++++++++++++-
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 890a2525bcfef..367ef93457f9f 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -19,3 +19,11 @@ mkdir coco -p
 wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
 unzip coco/train2017.zip -d coco/
 ```
+
+# Downloading the BurstGPT dataset
+
+You can download the BurstGPT v1.1 dataset by running:
+
+```bash
+wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv
+```
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 1044bef594173..0c892384236bc 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -38,6 +38,7 @@ from datetime import datetime
 from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple
 
 import numpy as np
+import pandas as pd
 from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
                                   RequestFuncOutput)
 from datasets import load_dataset
@@ -131,6 +132,35 @@ def sample_sharegpt_requests(
     return filtered_dataset
 
 
+def sample_burstgpt_requests(
+    dataset_path: str,
+    num_requests: int,
+    random_seed: int,
+    tokenizer: PreTrainedTokenizerBase,
+) -> List[Tuple[str, int, int, None]]:
+    df = pd.read_csv(dataset_path)
+    gpt4_df = df[df["Model"] == "GPT-4"]
+    # Remove the failed requests (i.e., response length is 0)
+    gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0]
+    # Randomly sample num_requests from the dataset
+    if num_requests <= len(gpt4_df):
+        gpt4_df = gpt4_df.sample(n=num_requests, random_state=random_seed)
+    else:
+        gpt4_df = gpt4_df.sample(n=num_requests,
+                                 random_state=random_seed,
+                                 replace=True)
+    # Convert the dataframe to a list of tuples
+    dataset = gpt4_df.values.tolist()
+    input_requests = []
+    for i in range(num_requests):
+        input_len = int(dataset[i][2])
+        output_len = int(dataset[i][3])
+        prompt = tokenizer.decode([(i + j) % tokenizer.vocab_size
+                                   for j in range(input_len)])
+        input_requests.append((prompt, input_len, output_len, None))
+    return input_requests
+
+
 def sample_sonnet_requests(
     dataset_path: str,
     num_requests: int,
@@ -830,6 +860,14 @@ def main(args: argparse.Namespace):
             fixed_output_len=args.sharegpt_output_len,
         )
 
+    elif args.dataset_name == "burstgpt":
+        input_requests = sample_burstgpt_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            random_seed=args.seed,
+            tokenizer=tokenizer,
+        )
+
     elif args.dataset_name == "sonnet":
         # Do not format the prompt, pass to message directly
         if args.backend == "openai-chat":
@@ -995,7 +1033,7 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "sonnet", "random", "hf"],
+        choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument("--dataset-path",

From c320ca8edd5c4c19e7581703e428dd566b068756 Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Tue, 11 Feb 2025 02:25:25 -0500
Subject: [PATCH 07/43] [Core] Don't do platform detection at import time
 (#12933)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 vllm/executor/executor_base.py | 6 +++---
 vllm/executor/ray_utils.py     | 6 +++---
 vllm/platforms/cuda.py         | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index fb76276bb4b34..242690f8e1b8f 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -8,11 +8,11 @@ from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
 import torch.nn as nn
 from typing_extensions import TypeVar
 
+import vllm.platforms
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.sampler import SamplerOutput
-from vllm.platforms import current_platform
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest, PoolerOutput
 from vllm.utils import make_async
@@ -108,8 +108,8 @@ class ExecutorBase(ABC):
         """
         # NOTE: This is logged in the executor because there can be >1 workers.
         logger.info("# %s blocks: %d, # CPU blocks: %d",
-                    current_platform.dispatch_key, num_gpu_blocks,
-                    num_cpu_blocks)
+                    vllm.platforms.current_platform.dispatch_key,
+                    num_gpu_blocks, num_cpu_blocks)
         max_concurrency = (num_gpu_blocks * self.cache_config.block_size /
                            self.model_config.max_model_len)
         logger.info("Maximum concurrency for %s tokens per request: %.2fx",
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 7b30155971a6d..33c0a25803ca6 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -7,10 +7,10 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 import msgspec
 
+import vllm.platforms
 from vllm.config import ParallelConfig
 from vllm.executor.msgspec_utils import decode_hook, encode_hook
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
 from vllm.sequence import ExecuteModelRequest, IntermediateTensors
 from vllm.utils import get_ip
 from vllm.worker.worker_base import WorkerWrapperBase
@@ -54,10 +54,10 @@ try:
 
         def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
             node_id = ray.get_runtime_context().get_node_id()
-            device_key = current_platform.ray_device_key
+            device_key = vllm.platforms.current_platform.ray_device_key
             if not device_key:
                 raise RuntimeError("current platform %s does not support ray.",
-                                   current_platform.device_name)
+                                   vllm.platforms.current_platform.device_name)
             gpu_ids = ray.get_runtime_context().get_accelerator_ids(
             )[device_key]
             return node_id, gpu_ids
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 991d55ac861a4..9deb0294668ec 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -334,10 +334,10 @@ class NvmlCudaPlatform(CudaPlatformBase):
             if (len(set(device_names)) > 1
                     and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID"):
                 logger.warning(
-                    "Detected different devices in the system: \n%s\nPlease"
+                    "Detected different devices in the system: %s. Please"
                     " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to "
                     "avoid unexpected behavior.",
-                    "\n".join(device_names),
+                    ", ".join(device_names),
                 )
 
 

From 78a141d768a18edc8c598a57d992e6aa56a33259 Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Date: Tue, 11 Feb 2025 12:56:03 +0530
Subject: [PATCH 08/43] [Misc] LoRA - Refactor Punica ops tests (#12970)

Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
---
 tests/lora/test_punica_ops.py           | 652 ++++++++++++++++++++++++
 tests/lora/test_punica_ops_sizes.py     | 401 ---------------
 tests/lora/test_punica_ops_variation.py | 317 ------------
 tests/lora/utils.py                     |  41 +-
 4 files changed, 686 insertions(+), 725 deletions(-)
 create mode 100644 tests/lora/test_punica_ops.py
 delete mode 100644 tests/lora/test_punica_ops_sizes.py
 delete mode 100644 tests/lora/test_punica_ops_variation.py

diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py
new file mode 100644
index 0000000000000..032e20470bcd3
--- /dev/null
+++ b/tests/lora/test_punica_ops.py
@@ -0,0 +1,652 @@
+# SPDX-License-Identifier: Apache-2.0
+from threading import Lock
+from typing import List
+
+import pytest
+import torch
+
+import vllm.lora.ops.triton_ops  # noqa: F401
+from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
+                                     bgmv_shrink, sgmv_expand,
+                                     sgmv_expand_slice, sgmv_shrink)
+from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
+from vllm.platforms import current_platform
+
+from .utils import (PunicaTensors, assert_close, generate_data,
+                    generate_data_for_expand_nslices,
+                    generate_data_for_nslices)
+
+
+# Utility shrink and expand operations used as reference implementations.
+def sgmv_shrink_for_nslices(
+        nslices: int, inputs_tensor: torch.Tensor,
+        lora_weights_lst: List[torch.Tensor], out_tensor: torch.Tensor,
+        b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor,
+        prompt_lora_mapping: torch.Tensor, batches: int, max_seq_length: int,
+        num_tokens: int, scaling: float):
+    """
+    Wrapper around sgmv_shrink that handles any nslices.
+    """
+    for index in range(nslices):
+        sgmv_shrink(
+            inputs_tensor,
+            lora_weights_lst[index],
+            out_tensor[index],
+            b_seq_start_loc,
+            seq_len_tensor,
+            prompt_lora_mapping,
+            batches,
+            max_seq_length,
+            num_tokens,
+            scaling,
+        )
+
+
+def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
+                            inputs_tensor: torch.Tensor,
+                            lora_weights_lst: List[torch.Tensor],
+                            out_tensor: torch.Tensor,
+                            b_seq_start_loc: torch.Tensor,
+                            seq_len_tensor: torch.Tensor,
+                            prompt_lora_mapping: torch.Tensor, batches: int,
+                            max_seq_length: int, num_tokens: int,
+                            add_inputs: bool) -> None:
+    """
+    Wrapper around sgmv_expand that handles any nslices.
+    """
+    if nslices == 1:
+        # Verify the torch's sgmv_expand op
+        sgmv_expand(
+            inputs_tensor[0],
+            lora_weights_lst[0],
+            out_tensor,
+            b_seq_start_loc,
+            seq_len_tensor,
+            prompt_lora_mapping,
+            batches,
+            max_seq_length,
+            num_tokens,
+            add_inputs=add_inputs,
+        )
+    else:
+        slice_offset = 0
+        for index in range(nslices):
+            lora_weights = lora_weights_lst[index]
+            sgmv_expand_slice(
+                inputs_tensor[index],
+                lora_weights,
+                out_tensor,
+                b_seq_start_loc,
+                seq_len_tensor,
+                prompt_lora_mapping,
+                batches,
+                max_seq_length,
+                num_tokens,
+                slice_offset,
+                hidden_size,
+                add_inputs=add_inputs,
+            )
+            slice_offset += hidden_size
+
+
+_dict_lock = Lock()
+
+
+def check_sgmv_shrink(batches: int, num_loras: int, rank: int,
+                      hidden_size: int, nslices: int, dtype: torch.dtype,
+                      device: str, seq_length: int, scaling: float):
+    """
+    Compare outputs of vllm.sgmv_shrink kernel against a reference
+    implementation.
+    """
+    data: PunicaTensors = generate_data_for_nslices(
+        batches,
+        hidden_size,
+        num_loras,
+        rank,
+        seq_length,
+        nslices,
+        dtype,
+        "shrink",
+        device,
+    )
+    max_seq_length, token_nums = data.meta()
+
+    # Preventing cache error pointer.
+    with _dict_lock:
+        _LORA_A_PTR_DICT.clear()
+        torch.ops.vllm.sgmv_shrink(
+            data.inputs_tensor,
+            data.lora_weights,
+            data.our_out_tensor,
+            data.b_seq_start_loc,
+            data.seq_len_tensor,
+            data.prompt_lora_mapping,
+            batches,
+            max_seq_length,
+            token_nums,
+            scaling,
+        )
+
+        sgmv_shrink_for_nslices(
+            nslices,
+            data.inputs_tensor,
+            data.lora_weights,
+            data.ref_out_tensor,
+            data.b_seq_start_loc,
+            data.seq_len_tensor,
+            data.prompt_lora_mapping,
+            batches,
+            max_seq_length,
+            token_nums,
+            scaling,
+        )
+    assert_close(data.our_out_tensor, data.ref_out_tensor)
+
+
+def check_sgmv_expand(batches: int, num_loras: int, rank: int,
+                      hidden_size: int, nslices: int, dtype: torch.dtype,
+                      device: str, seq_length: int, add_inputs: bool):
+    """
+    Compare outputs of vllm.sgmv_expand kernel against a reference
+    implementation.
+    """
+    data: PunicaTensors = generate_data_for_nslices(
+        batches,
+        hidden_size,
+        num_loras,
+        rank,
+        seq_length,
+        nslices,
+        dtype,
+        "expand",
+        device,
+    )
+
+    max_seq_length, token_nums = data.meta()
+
+    with _dict_lock:
+        _LORA_B_PTR_DICT.clear()
+        torch.ops.vllm.sgmv_expand(
+            data.inputs_tensor,
+            data.lora_weights,
+            data.our_out_tensor,
+            data.b_seq_start_loc,
+            data.seq_len_tensor,
+            data.prompt_lora_mapping,
+            batches,
+            max_seq_length,
+            token_nums,
+            offset_start=0,
+            add_inputs=add_inputs,
+        )
+
+    sgmv_expand_for_nslices(nslices,
+                            hidden_size,
+                            data.inputs_tensor,
+                            data.lora_weights,
+                            data.ref_out_tensor,
+                            data.b_seq_start_loc,
+                            data.seq_len_tensor,
+                            data.prompt_lora_mapping,
+                            batches,
+                            max_seq_length,
+                            token_nums,
+                            add_inputs=add_inputs)
+
+    assert_close(data.our_out_tensor, data.ref_out_tensor)
+
+
+def check_bgmv_shrink(batches: int, num_loras: int, rank: int,
+                      hidden_size: int, dtype: torch.dtype, device: str,
+                      scaling: float):
+    """
+    Compare vllm.bgmv_shrink against a reference implementation.
+    """
+    seq_length = 1
+    data: PunicaTensors = generate_data(
+        batches,
+        hidden_size,
+        num_loras,
+        rank,
+        seq_length,
+        dtype,
+        "shrink",
+        device,
+    )
+
+    torch.ops.vllm.bgmv_shrink(
+        data.inputs_tensor,
+        data.lora_weights,
+        data.our_out_tensor,
+        data.token_lora_mapping,
+        scaling,
+    )
+
+    bgmv_shrink(
+        data.inputs_tensor,
+        data.lora_weights,
+        data.ref_out_tensor,
+        data.token_lora_mapping,
+        scaling,
+    )
+
+    data.ref_out_tensor = data.ref_out_tensor.to(torch.float32)
+    assert_close(data.our_out_tensor, data.ref_out_tensor)
+
+
+def check_bgmv_expand(batches: int, num_loras: int, rank: int,
+                      hidden_size: int, dtype: torch.dtype, device: str,
+                      add_inputs: bool):
+    """
+    Compare vllm.bgmv_expand against a reference implementation.
+    """
+    seq_length = 1
+    data: PunicaTensors = generate_data(
+        batches,
+        hidden_size,
+        num_loras,
+        rank,
+        seq_length,
+        dtype,
+        "expand",
+        device,
+    )
+
+    torch.ops.vllm.bgmv_expand(
+        data.inputs_tensor,
+        data.lora_weights,
+        data.our_out_tensor,
+        data.token_lora_mapping,
+        add_inputs=add_inputs,
+    )
+    bgmv_expand(
+        data.inputs_tensor,
+        data.lora_weights,
+        data.ref_out_tensor,
+        data.token_lora_mapping,
+        add_inputs=add_inputs,
+    )
+    assert_close(data.our_out_tensor, data.ref_out_tensor)
+
+
+def check_bgmv_expand_slice(batches: int, num_loras: int, rank: int,
+                            hidden_size: int, nslices: int, dtype: torch.dtype,
+                            device: str, add_inputs: bool):
+    """
+    Compare vllm.bgmv_expand_slice against a reference implementation.
+    """
+    seq_length = 1
+    data: PunicaTensors = generate_data_for_expand_nslices(
+        batches,
+        hidden_size,
+        num_loras,
+        rank,
+        seq_length,
+        dtype,
+        nslices,
+        device,
+    )
+
+    slice_offset = 0
+    for index in range(nslices):
+        torch.ops.vllm.bgmv_expand_slice(
+            data.inputs_tensor,
+            data.lora_weights[index],
+            data.our_out_tensor,
+            data.token_lora_mapping,
+            slice_offset,
+            slice_size=hidden_size,
+            add_inputs=add_inputs,
+        )
+        bgmv_expand_slice(
+            data.inputs_tensor,
+            data.lora_weights[index],
+            data.ref_out_tensor,
+            data.token_lora_mapping,
+            slice_offset,
+            slice_size=hidden_size,
+            add_inputs=add_inputs,
+        )
+
+        slice_offset += hidden_size
+    assert_close(data.our_out_tensor, data.ref_out_tensor)
+
+
+# Tests
+# We test the punica kernels along 2 verticals mainly.
+# 1. Variations in hidden_dim size
+# 2. Variations in all other parameters like (batch_size, max_rank, num_loras
+#  etc.)
+
+# We have collected the hidden_sizes included in the LoRA models
+# currently supported by vLLM. It tests whether the corresponding Triton
+# kernel can run normally when tensor parallelism is set to
+# [1, 2, 4, 8, 16, 32, 64].
+HIDDEN_SIZES = [
+    128,
+    256,
+    512,
+    896,
+    1024,
+    1152,
+    1216,
+    1280,
+    1536,
+    1664,
+    2048,
+    2240,
+    2304,
+    2368,
+    2432,
+    2560,
+    2752,
+    3072,
+    3328,
+    3456,
+    3584,
+    3712,
+    4096,
+    4480,
+    4608,
+    4736,
+    4864,
+    5120,
+    5504,
+    5632,
+    5888,
+    6144,
+    6400,
+    6848,
+    6912,
+    7168,
+    7424,
+    8192,
+    8960,
+    9216,
+    9472,
+    10240,
+    11008,
+    11264,
+    13824,
+    14336,
+    14784,
+    14848,
+    15360,
+    18944,
+    22016,
+    22528,
+    24576,
+    27392,
+    27648,
+    29568,
+    29696,
+    32000,
+    32256,
+    32512,
+    32768,
+    33024,
+    36864,
+    43264,
+    49152,
+    49408,
+    60544,
+    60672,
+    64000,
+    64256,
+    102400,
+    102656,
+    128000,
+    128256,
+]
+#The size of TP
+divisibility = [1, 2, 8, 16, 64]
+
+all_hidden_size = []
+for div in divisibility:
+    for hidden_size in HIDDEN_SIZES:
+        all_hidden_size.append(hidden_size // div)
+
+HIDDEN_SIZES = list(set(all_hidden_size))
+
+# Test params that focuses on hidden_size variation.
+hs_test_params = {
+    "hidden_sizes": HIDDEN_SIZES,
+    "batches": [4],
+    "num_loras": [4],
+    "max_ranks": [32],
+}
+
+# General tests params that tests for variations in all dimensions
+# except hidden_size.
+test_params = {
+    "hidden_sizes": [2049],
+    "batches": [1, 4, 16, 32],
+    "num_loras": [1, 8, 32, 128],
+    "max_ranks": [1, 4, 8, 16, 32, 64, 128, 256],
+}
+
+DTYPES = [torch.float16, torch.bfloat16]
+DEVICES = [f"cuda:{0}"]
+SEED = [0]
+
+
+@pytest.mark.parametrize("batches", test_params['batches'])
+@pytest.mark.parametrize("num_loras", test_params['num_loras'])
+@pytest.mark.parametrize("rank", test_params['max_ranks'])
+@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
+@pytest.mark.parametrize("nslices", [1, 2, 3])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("seed", SEED)
+@pytest.mark.parametrize("op_type", ["shrink", "expand"])
+def test_punica_sgmv(
+    batches: int,
+    num_loras: int,
+    rank: int,
+    hidden_size: int,
+    nslices: int,
+    dtype: torch.dtype,
+    device: str,
+    seed: int,
+    op_type: str,
+):
+    torch.set_default_device(device)
+    current_platform.seed_everything(seed)
+
+    if op_type == "shrink":
+        check_sgmv_shrink(batches=batches,
+                          num_loras=num_loras,
+                          rank=rank,
+                          hidden_size=hidden_size,
+                          nslices=nslices,
+                          dtype=dtype,
+                          device=device,
+                          seq_length=128,
+                          scaling=0.5)
+    else:
+        check_sgmv_expand(batches=batches,
+                          num_loras=num_loras,
+                          rank=rank,
+                          hidden_size=hidden_size,
+                          nslices=nslices,
+                          dtype=dtype,
+                          device=device,
+                          seq_length=128,
+                          add_inputs=True)
+
+
+@pytest.mark.parametrize("batches", hs_test_params['batches'])
+@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
+@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
+@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
+@pytest.mark.parametrize("nslices", [1, 2, 3])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("seed", SEED)
+@pytest.mark.parametrize("op_type", ["shrink", "expand"])
+def test_punica_sgmv_hidden_size(
+    batches: int,
+    num_loras: int,
+    rank: int,
+    hidden_size: int,
+    nslices: int,
+    dtype: torch.dtype,
+    device: str,
+    seed: int,
+    op_type: str,
+):
+    torch.set_default_device(device)
+    current_platform.seed_everything(seed)
+
+    if op_type == "shrink":
+        check_sgmv_shrink(batches=batches,
+                          num_loras=num_loras,
+                          rank=rank,
+                          hidden_size=hidden_size,
+                          nslices=nslices,
+                          dtype=dtype,
+                          device=device,
+                          seq_length=128,
+                          scaling=0.5)
+    else:
+        check_sgmv_expand(batches=batches,
+                          num_loras=num_loras,
+                          rank=rank,
+                          hidden_size=hidden_size,
+                          nslices=nslices,
+                          dtype=dtype,
+                          device=device,
+                          seq_length=128,
+                          add_inputs=True)
+
+
+@pytest.mark.parametrize("batches", test_params['batches'])
+@pytest.mark.parametrize("num_loras", test_params['num_loras'])
+@pytest.mark.parametrize("rank", test_params['max_ranks'])
+@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("seed", SEED)
+@pytest.mark.parametrize("op_type", ["shrink", "expand"])
+def test_punica_bgmv(
+    batches: int,
+    num_loras: int,
+    rank: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    device: str,
+    seed: int,
+    op_type: str,
+):
+    torch.set_default_device(device)
+    current_platform.seed_everything(seed)
+
+    if op_type == "shrink":
+        check_bgmv_shrink(batches=batches,
+                          num_loras=num_loras,
+                          rank=rank,
+                          hidden_size=hidden_size,
+                          dtype=dtype,
+                          device=device,
+                          scaling=0.5)
+    else:
+        check_bgmv_expand(batches=batches,
+                          num_loras=num_loras,
+                          rank=rank,
+                          hidden_size=hidden_size,
+                          dtype=dtype,
+                          device=device,
+                          add_inputs=True)
+
+
+@pytest.mark.parametrize("batches", hs_test_params['batches'])
+@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
+@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
+@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("seed", SEED)
+@pytest.mark.parametrize("op_type", ["shrink", "expand"])
+def test_punica_bgmv_hidden_size(
+    batches: int,
+    num_loras: int,
+    rank: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    device: str,
+    seed: int,
+    op_type: str,
+):
+    torch.set_default_device(device)
+    current_platform.seed_everything(seed)
+
+    if op_type == "shrink":
+        check_bgmv_shrink(batches=batches,
+                          num_loras=num_loras,
+                          rank=rank,
+                          hidden_size=hidden_size,
+                          dtype=dtype,
+                          device=device,
+                          scaling=0.5)
+    else:
+        check_bgmv_expand(batches=batches,
+                          num_loras=num_loras,
+                          rank=rank,
+                          hidden_size=hidden_size,
+                          dtype=dtype,
+                          device=device,
+                          add_inputs=True)
+
+
+@pytest.mark.parametrize("batches", test_params['batches'])
+@pytest.mark.parametrize("num_loras", test_params['num_loras'])
+@pytest.mark.parametrize("rank", test_params['max_ranks'])
+@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
+@pytest.mark.parametrize("nslices", [2, 3])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("seed", SEED)
+def test_punica_bgmv_expand_nslices(batches: int, num_loras: int, rank: int,
+                                    hidden_size: int, nslices: int,
+                                    dtype: torch.dtype, device: str,
+                                    seed: int):
+
+    torch.set_default_device(device)
+    current_platform.seed_everything(seed)
+
+    check_bgmv_expand_slice(batches=batches,
+                            num_loras=num_loras,
+                            rank=rank,
+                            hidden_size=hidden_size,
+                            nslices=nslices,
+                            dtype=dtype,
+                            device=device,
+                            add_inputs=True)
+
+
+@pytest.mark.parametrize("batches", hs_test_params['batches'])
+@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
+@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
+@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
+@pytest.mark.parametrize("nslices", [2, 3])
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("seed", SEED)
+def test_punica_bgmv_expand_nslices_hidden_size(batches: int, num_loras: int,
+                                                rank: int, hidden_size: int,
+                                                nslices: int,
+                                                dtype: torch.dtype,
+                                                device: str, seed: int):
+
+    torch.set_default_device(device)
+    current_platform.seed_everything(seed)
+
+    check_bgmv_expand_slice(batches=batches,
+                            num_loras=num_loras,
+                            rank=rank,
+                            hidden_size=hidden_size,
+                            nslices=nslices,
+                            dtype=dtype,
+                            device=device,
+                            add_inputs=True)
diff --git a/tests/lora/test_punica_ops_sizes.py b/tests/lora/test_punica_ops_sizes.py
deleted file mode 100644
index ecd3bc4978f39..0000000000000
--- a/tests/lora/test_punica_ops_sizes.py
+++ /dev/null
@@ -1,401 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-"""
-This script is mainly used to tests various hidden_sizes. We have collected the
-hidden_sizes included in the LoRA models currently supported by vLLM. It tests
-whether the corresponding Triton kernel can run normally when tensor parallelism
-is set to [1, 2, 4, 8, 16, 32, 64].
-"""
-from threading import Lock
-
-import pytest
-import torch
-
-import vllm.lora.ops.triton_ops  # noqa: F401
-from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
-                                     bgmv_shrink, sgmv_expand,
-                                     sgmv_expand_slice, sgmv_shrink)
-from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
-from vllm.platforms import current_platform
-
-from .utils import (assert_close, generate_data,
-                    generate_data_for_expand_nslices,
-                    generate_data_for_nslices)
-
-HIDDEN_SIZES = [
-    128,
-    256,
-    512,
-    896,
-    1024,
-    1152,
-    1216,
-    1280,
-    1536,
-    1664,
-    2048,
-    2240,
-    2304,
-    2368,
-    2432,
-    2560,
-    2752,
-    3072,
-    3328,
-    3456,
-    3584,
-    3712,
-    4096,
-    4480,
-    4608,
-    4736,
-    4864,
-    5120,
-    5504,
-    5632,
-    5888,
-    6144,
-    6400,
-    6848,
-    6912,
-    7168,
-    7424,
-    8192,
-    8960,
-    9216,
-    9472,
-    10240,
-    11008,
-    11264,
-    13824,
-    14336,
-    14784,
-    14848,
-    15360,
-    18944,
-    22016,
-    22528,
-    24576,
-    27392,
-    27648,
-    29568,
-    29696,
-    32000,
-    32256,
-    32512,
-    32768,
-    33024,
-    36864,
-    43264,
-    49152,
-    49408,
-    60544,
-    60672,
-    64000,
-    64256,
-    102400,
-    102656,
-    128000,
-    128256,
-]
-#The size of TP
-divisibility = [1, 2, 8, 16, 64]
-
-all_hidden_size = []
-for div in divisibility:
-    for hidden_size in HIDDEN_SIZES:
-        all_hidden_size.append(hidden_size // div)
-
-HIDDEN_SIZES = list(set(all_hidden_size))
-
-BATCHES = [4]
-NUM_LORA = [4]
-DTYPES = [torch.float16, torch.bfloat16]
-MAX_RANKS = [32]
-SCALES = [0.5]
-SEED = [0]
-DEVICES = [f"cuda:{0}"]
-
-_dict_lock = Lock()
-
-
-@pytest.mark.parametrize("batches", BATCHES)
-@pytest.mark.parametrize("num_loras", NUM_LORA)
-@pytest.mark.parametrize("rank", MAX_RANKS)
-@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
-@pytest.mark.parametrize("scaling", SCALES)
-@pytest.mark.parametrize("nslices", [1, 2, 3])
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("op_type", ["shrink", "expand"])
-@pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", DEVICES)
-def test_punica_sgmv(
-    batches: int,
-    num_loras: int,
-    rank: int,
-    hidden_size: int,
-    scaling: float,
-    nslices: int,
-    dtype: torch.dtype,
-    op_type: str,
-    seed: int,
-    device: str,
-):
-    torch.set_default_device(device)
-    current_platform.seed_everything(seed)
-
-    seq_length = 128
-    (
-        inputs_tensor,
-        lora_weights_lst,
-        our_out_tensor,
-        ref_out_tensor,
-        b_seq_start_loc,
-        lora_indices_tensor,
-        seq_len_tensor,
-        indices,
-    ) = generate_data_for_nslices(
-        batches,
-        hidden_size,
-        num_loras,
-        rank,
-        seq_length,
-        nslices,
-        dtype,
-        op_type,
-        device,
-    )
-    max_seq_length = seq_len_tensor.max()
-    token_nums = seq_len_tensor.sum().item()
-    if isinstance(max_seq_length, tuple):
-        max_seq_length = max_seq_length[0].item()
-    else:
-        max_seq_length = max_seq_length.item()
-    if op_type == "shrink":
-        # Preventing cache error pointer.
-        with _dict_lock:
-            _LORA_A_PTR_DICT.clear()
-            torch.ops.vllm.sgmv_shrink(
-                inputs_tensor,
-                lora_weights_lst,
-                our_out_tensor,
-                b_seq_start_loc,
-                seq_len_tensor,
-                lora_indices_tensor,
-                batches,
-                max_seq_length,
-                token_nums,
-                scaling,
-            )
-        for index in range(nslices):
-            sgmv_shrink(
-                inputs_tensor,
-                lora_weights_lst[index],
-                ref_out_tensor[index],
-                b_seq_start_loc,
-                seq_len_tensor,
-                lora_indices_tensor,
-                batches,
-                max_seq_length,
-                token_nums,
-                scaling,
-            )
-
-    else:
-        with _dict_lock:
-            _LORA_B_PTR_DICT.clear()
-            torch.ops.vllm.sgmv_expand(
-                inputs_tensor,
-                lora_weights_lst,
-                our_out_tensor,
-                b_seq_start_loc,
-                seq_len_tensor,
-                lora_indices_tensor,
-                batches,
-                max_seq_length,
-                token_nums,
-                offset_start=0,
-                add_inputs=True,
-            )
-        if nslices == 1:
-            # Verify the torch's sgmv_expand op
-            sgmv_expand(
-                inputs_tensor[0],
-                lora_weights_lst[0],
-                ref_out_tensor,
-                b_seq_start_loc,
-                seq_len_tensor,
-                lora_indices_tensor,
-                batches,
-                max_seq_length,
-                token_nums,
-                add_inputs=True,
-            )
-        else:
-            slice_offset = 0
-            for index in range(nslices):
-                lora_weights = lora_weights_lst[index]
-                sgmv_expand_slice(
-                    inputs_tensor[index],
-                    lora_weights,
-                    ref_out_tensor,
-                    b_seq_start_loc,
-                    seq_len_tensor,
-                    lora_indices_tensor,
-                    batches,
-                    max_seq_length,
-                    token_nums,
-                    slice_offset,
-                    hidden_size,
-                    add_inputs=True,
-                )
-                slice_offset += hidden_size
-
-    assert_close(our_out_tensor, ref_out_tensor)
-
-
-@pytest.mark.parametrize("batches", BATCHES)
-@pytest.mark.parametrize("num_loras", NUM_LORA)
-@pytest.mark.parametrize("rank", MAX_RANKS)
-@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
-@pytest.mark.parametrize("scaling", SCALES)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("op_type", ["shrink", "expand"])
-@pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", DEVICES)
-def test_punica_bgmv(
-    batches: int,
-    num_loras: int,
-    rank: int,
-    hidden_size: int,
-    scaling: float,
-    dtype: torch.dtype,
-    op_type: str,
-    seed: int,
-    device: str,
-):
-    torch.set_default_device(device)
-    current_platform.seed_everything(seed)
-
-    seq_length = 1
-    (
-        inputs_tensor,
-        lora_weights,
-        our_out_tensor,
-        ref_out_tensor,
-        b_seq_start_loc,
-        lora_indices_tensor,
-        seq_len_tensor,
-        indices,
-    ) = generate_data(
-        batches,
-        hidden_size,
-        num_loras,
-        rank,
-        seq_length,
-        dtype,
-        op_type,
-        device,
-    )
-    if op_type == "shrink":
-        torch.ops.vllm.bgmv_shrink(
-            inputs_tensor,
-            lora_weights,
-            our_out_tensor,
-            indices,
-            scaling,
-        )
-
-        bgmv_shrink(
-            inputs_tensor,
-            lora_weights,
-            ref_out_tensor,
-            indices,
-            scaling,
-        )
-
-    else:
-        torch.ops.vllm.bgmv_expand(
-            inputs_tensor,
-            lora_weights,
-            our_out_tensor,
-            indices,
-            add_inputs=True,
-        )
-        bgmv_expand(
-            inputs_tensor,
-            lora_weights,
-            ref_out_tensor,
-            indices,
-            add_inputs=True,
-        )
-
-    if op_type == "shrink":
-        ref_out_tensor = ref_out_tensor.to(torch.float32)
-    assert_close(our_out_tensor, ref_out_tensor)
-
-
-@pytest.mark.parametrize("batches", BATCHES)
-@pytest.mark.parametrize("num_loras", NUM_LORA)
-@pytest.mark.parametrize("rank", MAX_RANKS)
-@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
-@pytest.mark.parametrize("nslices", [2, 3])
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", DEVICES)
-def test_punica_bgmv_expand_nslices(
-    batches: int,
-    num_loras: int,
-    rank: int,
-    hidden_size: int,
-    nslices: int,
-    dtype: torch.dtype,
-    seed: int,
-    device: str,
-):
-    torch.set_default_device(device)
-    current_platform.seed_everything(seed)
-
-    seq_length = 1
-    (
-        inputs_tensor,
-        lora_weights_lst,
-        our_outputs,
-        ref_outputs,
-        b_seq_start_loc,
-        lora_indices_tensor,
-        seq_len_tensor,
-        indices,
-    ) = generate_data_for_expand_nslices(
-        batches,
-        hidden_size,
-        num_loras,
-        rank,
-        seq_length,
-        dtype,
-        nslices,
-        device,
-    )
-    slice_offset = 0
-    for index in range(nslices):
-        lora_weights = lora_weights_lst[index]
-        torch.ops.vllm.bgmv_expand_slice(
-            inputs_tensor,
-            lora_weights,
-            our_outputs,
-            indices,
-            slice_offset,
-            slice_size=hidden_size,
-            add_inputs=True,
-        )
-        bgmv_expand_slice(
-            inputs_tensor,
-            lora_weights,
-            ref_outputs,
-            indices,
-            slice_offset,
-            slice_size=hidden_size,
-            add_inputs=True,
-        )
-
-        slice_offset += hidden_size
-    assert_close(our_outputs, ref_outputs)
diff --git a/tests/lora/test_punica_ops_variation.py b/tests/lora/test_punica_ops_variation.py
deleted file mode 100644
index 6d1d3c9430f38..0000000000000
--- a/tests/lora/test_punica_ops_variation.py
+++ /dev/null
@@ -1,317 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-"""
-This script is mainly used to test whether trtion kernels can run normally
-under different conditions, including various batches, numbers of LoRA , and
-maximum ranks.
-"""
-from threading import Lock
-
-import pytest
-import torch
-
-# Enable custom op register
-import vllm.lora.ops.triton_ops  # noqa: F401
-from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
-                                     bgmv_shrink, sgmv_expand,
-                                     sgmv_expand_slice, sgmv_shrink)
-from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
-from vllm.platforms import current_platform
-
-from .utils import (assert_close, generate_data,
-                    generate_data_for_expand_nslices,
-                    generate_data_for_nslices)
-
-HIDDEN_SIZES = [2049]
-
-BATCHES = [1, 4, 16, 32]
-NUM_LORA = [1, 8, 32, 128]
-DTYPES = [torch.float16, torch.bfloat16]
-MAX_RANKS = [1, 4, 8, 16, 32, 64, 128, 256]
-SCALES = [0.5]
-SEED = [0]
-DEVICES = [f"cuda:{0}"]
-
-_dict_lock = Lock()
-
-
-@pytest.mark.parametrize("batches", BATCHES)
-@pytest.mark.parametrize("num_loras", NUM_LORA)
-@pytest.mark.parametrize("rank", MAX_RANKS)
-@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
-@pytest.mark.parametrize("scaling", SCALES)
-@pytest.mark.parametrize("nslices", [1, 2, 3])
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("op_type", ["shrink", "expand"])
-@pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", DEVICES)
-def test_punica_sgmv(
-    batches: int,
-    num_loras: int,
-    rank: int,
-    hidden_size: int,
-    scaling: float,
-    nslices: int,
-    dtype: torch.dtype,
-    op_type: str,
-    seed: int,
-    device: str,
-):
-    torch.set_default_device(device)
-    current_platform.seed_everything(seed)
-
-    seq_length = 128
-    (
-        inputs_tensor,
-        lora_weights_lst,
-        our_out_tensor,
-        ref_out_tensor,
-        b_seq_start_loc,
-        lora_indices_tensor,
-        seq_len_tensor,
-        indices,
-    ) = generate_data_for_nslices(
-        batches,
-        hidden_size,
-        num_loras,
-        rank,
-        seq_length,
-        nslices,
-        dtype,
-        op_type,
-        device,
-    )
-    max_seq_length = seq_len_tensor.max()
-    token_nums = seq_len_tensor.sum().item()
-    if isinstance(max_seq_length, tuple):
-        max_seq_length = max_seq_length[0].item()
-    else:
-        max_seq_length = max_seq_length.item()
-    if op_type == "shrink":
-        # Preventing cache error pointer.
-        with _dict_lock:
-            _LORA_A_PTR_DICT.clear()
-            torch.ops.vllm.sgmv_shrink(
-                inputs_tensor,
-                lora_weights_lst,
-                our_out_tensor,
-                b_seq_start_loc,
-                seq_len_tensor,
-                lora_indices_tensor,
-                batches,
-                max_seq_length,
-                token_nums,
-                scaling,
-            )
-        for index in range(nslices):
-            sgmv_shrink(
-                inputs_tensor,
-                lora_weights_lst[index],
-                ref_out_tensor[index],
-                b_seq_start_loc,
-                seq_len_tensor,
-                lora_indices_tensor,
-                batches,
-                max_seq_length,
-                token_nums,
-                scaling,
-            )
-
-    else:
-        with _dict_lock:
-            _LORA_B_PTR_DICT.clear()
-            torch.ops.vllm.sgmv_expand(
-                inputs_tensor,
-                lora_weights_lst,
-                our_out_tensor,
-                b_seq_start_loc,
-                seq_len_tensor,
-                lora_indices_tensor,
-                batches,
-                max_seq_length,
-                token_nums,
-                offset_start=0,
-                add_inputs=True,
-            )
-        slice_offset = 0
-        if nslices == 1:
-            # Verify the torch's sgmv_expand op
-            sgmv_expand(
-                inputs_tensor[0],
-                lora_weights_lst[0],
-                ref_out_tensor,
-                b_seq_start_loc,
-                seq_len_tensor,
-                lora_indices_tensor,
-                batches,
-                max_seq_length,
-                token_nums,
-                add_inputs=True,
-            )
-        else:
-            for index in range(nslices):
-                lora_weights = lora_weights_lst[index]
-                sgmv_expand_slice(
-                    inputs_tensor[index],
-                    lora_weights,
-                    ref_out_tensor,
-                    b_seq_start_loc,
-                    seq_len_tensor,
-                    lora_indices_tensor,
-                    batches,
-                    max_seq_length,
-                    token_nums,
-                    slice_offset,
-                    hidden_size,
-                    add_inputs=True,
-                )
-                slice_offset += hidden_size
-
-    assert_close(our_out_tensor, ref_out_tensor)
-
-
-@pytest.mark.parametrize("batches", BATCHES)
-@pytest.mark.parametrize("num_loras", NUM_LORA)
-@pytest.mark.parametrize("rank", MAX_RANKS)
-@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
-@pytest.mark.parametrize("scaling", SCALES)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("op_type", ["shrink", "expand"])
-@pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", DEVICES)
-def test_punica_bgmv(
-    batches: int,
-    num_loras: int,
-    rank: int,
-    hidden_size: int,
-    scaling: float,
-    dtype: torch.dtype,
-    op_type: str,
-    seed: int,
-    device: str,
-):
-    torch.set_default_device(device)
-    current_platform.seed_everything(seed)
-
-    seq_length = 1
-    (
-        inputs_tensor,
-        lora_weights,
-        our_out_tensor,
-        ref_out_tensor,
-        b_seq_start_loc,
-        lora_indices_tensor,
-        seq_len_tensor,
-        indices,
-    ) = generate_data(
-        batches,
-        hidden_size,
-        num_loras,
-        rank,
-        seq_length,
-        dtype,
-        op_type,
-        device,
-    )
-    if op_type == "shrink":
-        torch.ops.vllm.bgmv_shrink(
-            inputs_tensor,
-            lora_weights,
-            our_out_tensor,
-            indices,
-            scaling,
-        )
-
-        bgmv_shrink(
-            inputs_tensor,
-            lora_weights,
-            ref_out_tensor,
-            indices,
-            scaling,
-        )
-
-    else:
-        torch.ops.vllm.bgmv_expand(
-            inputs_tensor,
-            lora_weights,
-            our_out_tensor,
-            indices,
-            add_inputs=True,
-        )
-        bgmv_expand(
-            inputs_tensor,
-            lora_weights,
-            ref_out_tensor,
-            indices,
-            add_inputs=True,
-        )
-
-    if op_type == "shrink":
-        ref_out_tensor = ref_out_tensor.to(torch.float32)
-    assert_close(our_out_tensor, ref_out_tensor)
-
-
-@pytest.mark.parametrize("batches", BATCHES)
-@pytest.mark.parametrize("num_loras", NUM_LORA)
-@pytest.mark.parametrize("rank", MAX_RANKS)
-@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
-@pytest.mark.parametrize("nslices", [2, 3])
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEED)
-@pytest.mark.parametrize("device", DEVICES)
-def test_punica_bgmv_expand_nslices(
-    batches: int,
-    num_loras: int,
-    rank: int,
-    hidden_size: int,
-    nslices: int,
-    dtype: torch.dtype,
-    seed: int,
-    device: str,
-):
-    torch.set_default_device(device)
-    current_platform.seed_everything(seed)
-
-    seq_length = 1
-    (
-        inputs_tensor,
-        lora_weights_lst,
-        our_outputs,
-        ref_outputs,
-        b_seq_start_loc,
-        lora_indices_tensor,
-        seq_len_tensor,
-        indices,
-    ) = generate_data_for_expand_nslices(
-        batches,
-        hidden_size,
-        num_loras,
-        rank,
-        seq_length,
-        dtype,
-        nslices,
-        device,
-    )
-    slice_offset = 0
-    for index in range(nslices):
-        lora_weights = lora_weights_lst[index]
-        torch.ops.vllm.bgmv_expand_slice(
-            inputs_tensor,
-            lora_weights,
-            our_outputs,
-            indices,
-            slice_offset,
-            slice_size=hidden_size,
-            add_inputs=True,
-        )
-        bgmv_expand_slice(
-            inputs_tensor,
-            lora_weights,
-            ref_outputs,
-            indices,
-            slice_offset,
-            slice_size=hidden_size,
-            add_inputs=True,
-        )
-
-        slice_offset += hidden_size
-    assert_close(our_outputs, ref_outputs)
diff --git a/tests/lora/utils.py b/tests/lora/utils.py
index bda00e08190ef..1e163fbf97ce3 100644
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Dict, List, Optional
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple, Union
 
 import torch
 
@@ -106,6 +107,31 @@ def assert_close(a, b):
     torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
 
 
+@dataclass
+class PunicaTensors:
+    inputs_tensor: torch.Tensor
+    lora_weights: Union[torch.Tensor, List[torch.Tensor]]
+    our_out_tensor: torch.Tensor
+    ref_out_tensor: torch.Tensor
+    b_seq_start_loc: torch.Tensor
+    prompt_lora_mapping: torch.Tensor
+    seq_len_tensor: torch.Tensor
+    token_lora_mapping: torch.Tensor
+
+    def meta(self) -> Tuple[int, int]:
+        """
+        Infer max_seq_length and token_nums from the tensors
+        and return them.
+        """
+        max_seq_length = self.seq_len_tensor.max()
+        token_nums = self.seq_len_tensor.sum().item()
+        if isinstance(max_seq_length, tuple):
+            max_seq_length = max_seq_length[0].item()
+        else:
+            max_seq_length = max_seq_length.item()
+        return max_seq_length, token_nums
+
+
 def generate_data(
     batches,
     hidden_size,
@@ -115,7 +141,7 @@ def generate_data(
     dtype,
     op_type,
     device,
-):
+) -> PunicaTensors:
     seq_len_tensor = torch.randint(seq_length, seq_length + 1,
                                    (batches, )).to(device)
     b_seq_start_loc = torch.cumsum(
@@ -164,7 +190,8 @@ def generate_data(
         indices[current_offset:current_offset +
                 seq_len_tensor[b_id]].copy_(lora_index)
         current_offset += seq_len_tensor[b_id].item()
-    return (
+
+    return PunicaTensors(
         inputs_tensor,
         lora_weights,
         our_out_tensor,
@@ -185,7 +212,7 @@ def generate_data_for_expand_nslices(
     dtype,
     nslices,
     device,
-):
+) -> PunicaTensors:
     seq_len_tensor = torch.randint(seq_length, seq_length + 1,
                                    (batches, )).to(device)
     b_seq_start_loc = torch.cumsum(
@@ -222,7 +249,7 @@ def generate_data_for_expand_nslices(
         current_offset += seq_len_tensor[b_id].item()
 
     lora_indices_tensor = lora_indices_tensor.to(device)
-    return (
+    return PunicaTensors(
         inputs_tensor,
         lora_weights_lst,
         our_out_tensor,
@@ -244,7 +271,7 @@ def generate_data_for_nslices(
     dtype,
     op_type,
     device,
-):
+) -> PunicaTensors:
     seq_len_tensor = torch.randint(seq_length, seq_length + 1,
                                    (batches, )).to(device)
     b_seq_start_loc = torch.cumsum(
@@ -302,7 +329,7 @@ def generate_data_for_nslices(
         current_offset += seq_len_tensor[b_id].item()
 
     lora_indices_tensor = lora_indices_tensor.to(device)
-    return (
+    return PunicaTensors(
         inputs_tensor,
         lora_weights_lst,
         our_out_tensor,

From fc6485d27750076642e99a1ef2df0e6375958bb4 Mon Sep 17 00:00:00 2001
From: Ce Gao <cegao@tensorchord.ai>
Date: Tue, 11 Feb 2025 15:49:03 +0800
Subject: [PATCH 09/43] [Bugfix]: Reasoning output bug according to the chat
 template change (#13025)

Signed-off-by: Ce Gao <cegao@tensorchord.ai>
---
 .../openai_chat_completion_with_reasoning.py  |   8 +-
 .../test_deepseekr1_reasoning_parser.py       | 108 +++++++++++++++---
 .../deepseek_r1_reasoning_parser.py           |  54 +++++----
 3 files changed, 127 insertions(+), 43 deletions(-)

diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py
index a88c8adb55c28..b5dbed1205d35 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning.py
@@ -36,8 +36,8 @@ response = client.chat.completions.create(model=model, messages=messages)
 reasoning_content = response.choices[0].message.reasoning_content
 content = response.choices[0].message.content
 
-print("reasoning_content:", reasoning_content)
-print("content:", content)
+print("reasoning_content for Round 1:", reasoning_content)
+print("content for Round 1:", content)
 
 # Round 2
 messages.append({"role": "assistant", "content": content})
@@ -50,5 +50,5 @@ response = client.chat.completions.create(model=model, messages=messages)
 reasoning_content = response.choices[0].message.reasoning_content
 content = response.choices[0].message.content
 
-print("reasoning_content:", reasoning_content)
-print("content:", content)
+print("reasoning_content for Round 2:", reasoning_content)
+print("content for Round 2:", content)
diff --git a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
index f7b81be48bd11..fdadb2e21ff80 100644
--- a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
+++ b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
@@ -15,32 +15,62 @@ start_token = "<think>"
 end_token = "</think>"
 
 SIMPLE_REASONING = {
-    "output": "<think>This is a reasoning section</think>This is the rest",
+    "output": "This is a reasoning section</think>This is the rest",
     "reasoning_content": "This is a reasoning section",
     "content": "This is the rest",
 }
 COMPLETE_REASONING = {
-    "output": "<think>This is a reasoning section</think>",
+    "output": "This is a reasoning section</think>",
     "reasoning_content": "This is a reasoning section",
     "content": None,
 }
 NO_REASONING = {
-    "output": "This is a reasoning section",
+    "output": "This is content",
     "reasoning_content": None,
-    "content": "This is a reasoning section",
+    "content": "This is content",
+}
+NO_REASONING_STREAMING = {
+    "output": "This is a reasoning section",
+    "reasoning_content": "This is a reasoning section",
+    "content": None,
 }
 MULTIPLE_LINES = {
-    "output": "<think>This\nThat</think>This is the rest\nThat",
+    "output": "This\nThat</think>This is the rest\nThat",
     "reasoning_content": "This\nThat",
     "content": "This is the rest\nThat",
 }
 SHORTEST_REASONING_NO_STREAMING = {
-    "output": "<think></think>This is the rest",
+    "output": "</think>This is the rest",
     "reasoning_content": "",
     "content": "This is the rest",
 }
 SHORTEST_REASONING = {
-    "output": "<think></think>This is the rest",
+    "output": "</think>This is the rest",
+    "reasoning_content": None,
+    "content": "This is the rest",
+}
+REASONING_WITH_THINK = {
+    "output": "<think>This is a reasoning section</think>This is the rest",
+    "reasoning_content": "This is a reasoning section",
+    "content": "This is the rest",
+}
+COMPLETE_REASONING_WITH_THINK = {
+    "output": "<think>This is a reasoning section</think>",
+    "reasoning_content": "This is a reasoning section",
+    "content": None,
+}
+MULTIPLE_LINES_WITH_THINK = {
+    "output": "<think>This\nThat</think>This is the rest\nThat",
+    "reasoning_content": "This\nThat",
+    "content": "This is the rest\nThat",
+}
+SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
+    "output": "</think>This is the rest",
+    "reasoning_content": "",
+    "content": "This is the rest",
+}
+SHORTEST_REASONING_WITH_THINK = {
+    "output": "</think>This is the rest",
     "reasoning_content": None,
     "content": "This is the rest",
 }
@@ -49,37 +79,37 @@ TEST_CASES = [
     pytest.param(
         False,
         SIMPLE_REASONING,
-        id="simple_streaming",
+        id="simple_reasoning",
     ),
     pytest.param(
         True,
         SIMPLE_REASONING,
-        id="simple_streaming",
+        id="simple_reasoning_streaming",
     ),
     pytest.param(
         False,
         COMPLETE_REASONING,
-        id="complete_streaming",
+        id="complete_reasoning",
     ),
     pytest.param(
         True,
         COMPLETE_REASONING,
-        id="complete_streaming",
+        id="complete_reasoning_streaming",
     ),
     pytest.param(
         False,
         NO_REASONING,
-        id="no_streaming",
+        id="no_reasoning_token",
     ),
     pytest.param(
         True,
-        NO_REASONING,
-        id="no_streaming",
+        NO_REASONING_STREAMING,
+        id="no_reasoning_token_streaming",
     ),
     pytest.param(
         False,
         MULTIPLE_LINES,
-        id="multiple_lines_streaming",
+        id="multiple_lines",
     ),
     pytest.param(
         True,
@@ -89,23 +119,65 @@ TEST_CASES = [
     pytest.param(
         True,
         SHORTEST_REASONING,
-        id="shortest_streaming",
+        id="shortest",
     ),
     pytest.param(
         False,
         SHORTEST_REASONING_NO_STREAMING,
         id="shortest_streaming",
     ),
+    pytest.param(
+        False,
+        REASONING_WITH_THINK,
+        id="reasoning_with_think",
+    ),
+    pytest.param(
+        True,
+        REASONING_WITH_THINK,
+        id="reasoning_with_think_streaming",
+    ),
+    pytest.param(
+        False,
+        COMPLETE_REASONING_WITH_THINK,
+        id="complete_reasoning_with_think",
+    ),
+    pytest.param(
+        True,
+        COMPLETE_REASONING_WITH_THINK,
+        id="complete_reasoning_with_think_streaming",
+    ),
+    pytest.param(
+        False,
+        MULTIPLE_LINES_WITH_THINK,
+        id="multiple_lines_with_think",
+    ),
+    pytest.param(
+        True,
+        MULTIPLE_LINES_WITH_THINK,
+        id="multiple_lines_with_think_streaming",
+    ),
+    pytest.param(
+        False,
+        SHORTEST_REASONING_NO_STREAMING_WITH_THINK,
+        id="shortest_with_think",
+    ),
+    pytest.param(
+        True,
+        SHORTEST_REASONING_WITH_THINK,
+        id="shortest_with_think_streaming",
+    ),
 ]
 
+# Global tokenizer initialization to avoid repeated loading
+tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
+tokenizer.add_tokens([start_token, end_token])
+
 
 @pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
 def test_reasoning(
     streaming: bool,
     param_dict: dict,
 ):
-    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
-    tokenizer.add_tokens([start_token, end_token])
     output = tokenizer.tokenize(param_dict["output"])
     # decode everything to tokens
     output_tokens: List[str] = [
diff --git a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
index 5c19888d45401..33bba04882be6 100644
--- a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
+++ b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
@@ -67,6 +67,8 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
         ]):
             return None
 
+        # Check if <think> is present in previous or delta.
+        # Keep compatibility with models that don't generate <think> tokens.
         if self.think_start_token_id in previous_token_ids:
             if self.think_end_token_id in delta_token_ids:
                 # <think> in previous, </think> in delta,
@@ -85,7 +87,6 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
                 # reasoning content continues
                 return DeltaMessage(reasoning_content=delta_text)
         elif self.think_start_token_id in delta_token_ids:
-            logger.info(delta_text)
             if self.think_end_token_id in delta_token_ids:
                 # <think> in delta, </think> in delta, extract reasoning content
                 start_index = delta_text.find(self.think_start_token)
@@ -101,35 +102,46 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
                 # reasoning content continues
                 return DeltaMessage(reasoning_content=delta_text)
         else:
-            # No <think> in previous or delta, reasoning content continues.
-            return DeltaMessage(content=delta_text)
+            # No <think> in previous or delta, also need to check for </think>.
+            # Because the model may have generated </think> without <think>
+            # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+            if self.think_end_token_id in delta_token_ids:
+                # </think> in delta with more tokens,
+                # extract reasoning content and content
+                end_index = delta_text.find(self.think_end_token)
+                reasoning_content = delta_text[:end_index]
+                content = delta_text[end_index + len(self.think_end_token):]
+                return DeltaMessage(reasoning_content=reasoning_content,
+                                    content=content if content else None)
+            elif self.think_end_token_id in previous_token_ids:
+                # </think> in previous, thinking content ends
+                return DeltaMessage(content=delta_text)
+            else:
+                # no </think> in previous or delta, reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)
 
     def extract_reasoning_content(
             self, model_output: str, request: ChatCompletionRequest
     ) -> Tuple[Optional[str], Optional[str]]:
 
-        # Check if the model output contains the <think> tokens.
-        if (self.think_start_token not in model_output
-                or self.think_end_token not in model_output):
+        # DeepSeek R1 doesn't generate <think> now.
+        # Thus we assume the reasoning content is always at the start.
+        # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+        if self.think_end_token not in model_output:
             return None, model_output
         else:
+            # Add a start token if it's missing to keep compatibility.
+            if self.think_start_token not in model_output:
+                model_output = f"{self.think_start_token}{model_output}"
             # Use a regex to find the reasoning content
             reasoning_content = self.reasoning_regex.findall(model_output)[0]
 
-            # Remove the reasoning content from the model output
-            # Although deepseek's <think> token is always at the
-            # beginning of the line, we cannot guarantee that the
-            # other models will follow this convention.
-            # Therefore, we need to add :start_index.
-            start_index = model_output.find(self.think_start_token)
-            if start_index != -1:
-                end_index = start_index + len(
-                    f"{self.think_start_token}{reasoning_content}{self.think_end_token}"
-                )
-                model_output = model_output[:start_index] + \
-                                model_output[end_index:]
+            end_index = len(
+                f"{self.think_start_token}{reasoning_content}{self.think_end_token}"
+            )
+            final_output = model_output[end_index:]
 
-                if len(model_output) == 0:
-                    return reasoning_content, None
+            if len(final_output) == 0:
+                return reasoning_content, None
 
-            return reasoning_content, model_output
+            return reasoning_content, final_output

From 41c5dd45b98d5a6facad328a1ce534b9a94763a2 Mon Sep 17 00:00:00 2001
From: Cody Yu <hao.yu.cody@gmail.com>
Date: Tue, 11 Feb 2025 00:27:25 -0800
Subject: [PATCH 10/43] [V1][Metrics] Add GPU prefix cache hit rate % gauge
 (#12592)

---
 tests/entrypoints/openai/test_metrics.py |  2 +
 tests/v1/core/test_kv_cache_utils.py     | 39 ++++++++++++++-
 vllm/v1/core/kv_cache_manager.py         | 24 +++++++++
 vllm/v1/core/kv_cache_utils.py           | 64 ++++++++++++++++++++++++
 vllm/v1/core/scheduler.py                |  1 +
 vllm/v1/metrics/loggers.py               | 29 ++++++++++-
 vllm/v1/metrics/stats.py                 | 20 +++++++-
 7 files changed, 174 insertions(+), 5 deletions(-)

diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index de2333901cc91..8c1bb1a897e37 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -203,6 +203,8 @@ EXPECTED_METRICS_V1 = [
     "vllm:num_requests_running",
     "vllm:num_requests_waiting",
     "vllm:gpu_cache_usage_perc",
+    "vllm:gpu_prefix_cache_queries",
+    "vllm:gpu_prefix_cache_hits",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
     "vllm:request_success_total",
diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py
index 8df4cbe1be71b..ba08b83ec54e5 100644
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -5,10 +5,11 @@ import pytest
 from vllm.multimodal.inputs import MultiModalKwargs
 from vllm.sampling_params import SamplingParams
 from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
-                                         KVCacheBlock,
+                                         KVCacheBlock, PrefixCachingMetrics,
                                          generate_block_hash_extra_keys,
                                          hash_block_tokens,
                                          hash_request_tokens)
+from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request
 
 
@@ -277,3 +278,39 @@ def test_hash_request_tokens_no_mm_inputs():
     assert block_hashes[0].extra_keys is None
     assert block_hashes[1].token_ids == (3, 4, 5)
     assert block_hashes[1].extra_keys is None
+
+
+def test_metrics():
+    """
+    Test the prefix caching metrics.
+    """
+
+    def stats(requests, queries, hits):
+        return PrefixCacheStats(requests=requests, queries=queries, hits=hits)
+
+    metrics = PrefixCachingMetrics(interval=5)
+    assert metrics.hit_rate == 0.0
+
+    metrics.observe(stats(1, 20, 9))
+    # 9 / 20 = 0.45
+    assert metrics.hit_rate == 0.45
+
+    metrics.observe(stats(4, 80, 16))
+
+    # 25 / 100 = 0.25
+    assert metrics.hit_rate == 0.25
+
+    metrics.observe(stats(1, 10, 2))
+
+    # Remove (20, 9) and add (10, 2): 18 / 90 = 0.2
+    assert metrics.aggregated_requests == 5
+    assert metrics.aggregated_query_total == 90
+    assert metrics.aggregated_query_hit == 18
+    assert metrics.hit_rate == 0.2
+
+    metrics.reset()
+    assert metrics.hit_rate == 0.0
+    assert metrics.aggregated_requests == 0
+    assert metrics.aggregated_query_total == 0
+    assert metrics.aggregated_query_hit == 0
+    assert not metrics.query_queue
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index f8d08d0e40236..f75d31f542cf7 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -10,6 +10,7 @@ from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
                                          generate_block_hash_extra_keys,
                                          hash_block_tokens,
                                          hash_request_tokens)
+from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request, RequestStatus
 
 logger = init_logger(__name__)
@@ -78,11 +79,28 @@ class KVCacheManager:
         self.req_to_block_hashes: DefaultDict[
             str, List[BlockHashType]] = defaultdict(list)
 
+        self.prefix_cache_stats = PrefixCacheStats()
+
     @property
     def usage(self) -> float:
+        """Get the KV cache usage.
+
+        Returns:
+            The KV cache usage (between 0.0 and 1.0).
+        """
         return 1.0 - (self.free_block_queue.num_free_blocks /
                       self.num_gpu_blocks)
 
+    def make_prefix_cache_stats(self) -> PrefixCacheStats:
+        """Get (and reset) the prefix cache stats.
+
+        Returns:
+            The current prefix caching stats.
+        """
+        stats = self.prefix_cache_stats
+        self.prefix_cache_stats = PrefixCacheStats()
+        return stats
+
     def get_computed_blocks(
             self, request: Request) -> Tuple[List[KVCacheBlock], int]:
         """Get the computed (cached) blocks for the request.
@@ -118,6 +136,10 @@ class KVCacheManager:
             else:
                 break
 
+        self.prefix_cache_stats.requests += 1
+        self.prefix_cache_stats.queries += len(block_hashes)
+        self.prefix_cache_stats.hits += len(computed_blocks)
+
         # NOTE(woosuk): Since incomplete blocks are not eligible for
         # sharing, `num_computed_tokens` is always a multiple of
         # `block_size`.
@@ -280,6 +302,8 @@ class KVCacheManager:
         for block in self.block_pool:
             block.reset_hash()
 
+        self.prefix_cache_stats.reset = True
+
         logger.info("Successfully reset prefix cache")
         return True
 
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 6888f1a3e1823..bddb482d29167 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 """KV-Cache Utilities."""
+from collections import deque
 from collections.abc import Sequence
 from dataclasses import dataclass
 from typing import Any, List, NamedTuple, Optional, Tuple
@@ -8,6 +9,7 @@ from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.v1.kv_cache_interface import (KVCacheConfig, KVCacheSpec,
                                         KVCacheTensor)
+from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request
 
 logger = init_logger(__name__)
@@ -28,6 +30,68 @@ class BlockHashType(NamedTuple):
     extra_keys: Optional[Any] = None
 
 
+class PrefixCachingMetrics:
+    """Metrics for prefix caching with a hit rate of the most recent N requests.
+
+    Args:
+        interval: The number of the most recent requests to aggregate.
+            Defaults to 1000.
+    """
+
+    def __init__(self, interval: int = 1000):
+        self.interval = interval
+        # The current aggregated values.
+        self.aggregated_requests = 0
+        self.aggregated_query_total = 0
+        self.aggregated_query_hit = 0
+        # A deque of (requests, queries, hits) for the most recent requests.
+        self.query_queue: deque[Tuple[int, int, int]] = deque()
+
+    def observe(self, stats: PrefixCacheStats):
+        """Observe the prefix caching for a set of requests.
+
+        This function is called with information gathered when new requests
+        are being scheduled and are looking for computed blocks.
+
+        When there are more than `interval` requests, the oldest set of
+        requestsare removed from the metrics.
+
+        Args:
+            stats: The prefix cache stats.
+        """
+        # reset_prefix_cache was invoked before the current update.
+        # Reset the metrics before aggregating the current stats.
+        if stats.reset:
+            self.reset()
+
+        # Update the metrics.
+        self.query_queue.append((stats.requests, stats.queries, stats.hits))
+        self.aggregated_requests += stats.requests
+        self.aggregated_query_total += stats.queries
+        self.aggregated_query_hit += stats.hits
+
+        # Remove the oldest stats if the number of requests exceeds.
+        if self.aggregated_requests > self.interval:
+            old_requests, old_queries, old_hits = self.query_queue.popleft()
+            self.aggregated_requests -= old_requests
+            self.aggregated_query_total -= old_queries
+            self.aggregated_query_hit -= old_hits
+
+    def reset(self):
+        """Reset the metrics."""
+        self.aggregated_requests = 0
+        self.aggregated_query_total = 0
+        self.aggregated_query_hit = 0
+        self.query_queue.clear()
+
+    @property
+    def hit_rate(self) -> float:
+        """Calculate the hit rate for the past N requests."""
+        if self.aggregated_query_total == 0:
+            return 0.0
+        return self.aggregated_query_hit / self.aggregated_query_total
+
+
 @dataclass
 class KVCacheBlock:
     """KV-cache block metadata."""
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index 1c54914d182ba..985fcf01bb216 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -593,4 +593,5 @@ class Scheduler:
             num_running_reqs=len(self.running),
             num_waiting_reqs=len(self.waiting),
             gpu_cache_usage=self.kv_cache_manager.usage,
+            prefix_cache_stats=self.kv_cache_manager.make_prefix_cache_stats(),
         )
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index eb1acf584c6b0..3472761dc1808 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -9,6 +9,7 @@ import prometheus_client
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
+from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics
 from vllm.v1.engine import FinishReason
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
 
@@ -37,6 +38,9 @@ class LoggingStatLogger(StatLoggerBase):
         self.num_prompt_tokens: List[int] = []
         self.num_generation_tokens: List[int] = []
 
+        # Prefix cache metrics. TODO: Make the interval configurable.
+        self.prefix_caching_metrics = PrefixCachingMetrics()
+
     def _local_interval_elapsed(self, now: float) -> bool:
         # Log every _LOCAL_LOGGING_INTERVAL_SEC.
         elapsed_time = now - self.last_log_time
@@ -58,6 +62,8 @@ class LoggingStatLogger(StatLoggerBase):
 
         self._track_iteration_stats(iteration_stats)
 
+        self.prefix_caching_metrics.observe(scheduler_stats.prefix_cache_stats)
+
         now = time.monotonic()
         if not self._local_interval_elapsed(now):
             return
@@ -72,13 +78,15 @@ class LoggingStatLogger(StatLoggerBase):
         logger.info(
             "Avg prompt throughput: %.1f tokens/s, "
             "Avg generation throughput: %.1f tokens/s, "
-            "Running: %d reqs, Waiting: %d reqs "
-            "GPU KV cache usage: %.1f%%.",
+            "Running: %d reqs, Waiting: %d reqs, "
+            "GPU KV cache usage: %.1f%%, "
+            "Prefix cache hit rate: %.1f%%",
             prompt_throughput,
             generation_throughput,
             scheduler_stats.num_running_reqs,
             scheduler_stats.num_waiting_reqs,
             scheduler_stats.gpu_cache_usage * 100,
+            self.prefix_caching_metrics.hit_rate * 100,
         )
 
 
@@ -107,6 +115,18 @@ class PrometheusStatLogger(StatLoggerBase):
             documentation="GPU KV-cache usage. 1 means 100 percent usage.",
             labelnames=labelnames).labels(*labelvalues)
 
+        self.counter_gpu_prefix_cache_queries = prometheus_client.Counter(
+            name="vllm:gpu_prefix_cache_queries",
+            documentation=
+            "GPU prefix cache queries, in terms of number of queried blocks.",
+            labelnames=labelnames).labels(*labelvalues)
+
+        self.counter_gpu_prefix_cache_hits = prometheus_client.Counter(
+            name="vllm:gpu_prefix_cache_hits",
+            documentation=
+            "GPU prefix cache hits, in terms of number of cached blocks.",
+            labelnames=labelnames).labels(*labelvalues)
+
         self.counter_prompt_tokens = prometheus_client.Counter(
             name="vllm:prompt_tokens_total",
             documentation="Number of prefill tokens processed.",
@@ -170,6 +190,11 @@ class PrometheusStatLogger(StatLoggerBase):
 
         self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage)
 
+        self.counter_gpu_prefix_cache_queries.inc(
+            scheduler_stats.prefix_cache_stats.queries)
+        self.counter_gpu_prefix_cache_hits.inc(
+            scheduler_stats.prefix_cache_stats.hits)
+
         self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens)
         self.counter_generation_tokens.inc(
             iteration_stats.num_generation_tokens)
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index 5e588d35ea4d7..f806b0adf5d5a 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import time
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, List
 
 if TYPE_CHECKING:
@@ -9,6 +9,20 @@ if TYPE_CHECKING:
     from vllm.v1.engine import EngineCoreOutput, FinishReason
 
 
+@dataclass
+class PrefixCacheStats:
+    """Stores prefix cache hit statistics."""
+    # Whether reset_prefix_cache was invoked.
+    reset: bool = False
+    # The number of requests in this update.
+    requests: int = 0
+    # The number of queries in these requests. Note that "queries" here
+    # means the number of blocks that were queried from the cache.
+    queries: int = 0
+    # The number of hits in these requests.
+    hits: int = 0
+
+
 @dataclass
 class SchedulerStats:
     """Stats associated with the scheduler."""
@@ -17,7 +31,9 @@ class SchedulerStats:
     num_waiting_reqs: int = 0
 
     gpu_cache_usage: float = 0.0
-    # gpu_prefix_cache_hit_rate: float = 0.0
+
+    prefix_cache_stats: PrefixCacheStats = field(
+        default_factory=PrefixCacheStats)
 
 
 @dataclass

From 9cf4759493919580011f03812abf16387eafe18c Mon Sep 17 00:00:00 2001
From: Mengqing Cao <cmq0113@163.com>
Date: Tue, 11 Feb 2025 21:20:53 +0800
Subject: [PATCH 11/43] [executor] init `local_rank` as device index (#13027)

Signed-off-by: Mengqing Cao <cmq0113@163.com>
---
 vllm/executor/uniproc_executor.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py
index e5464cafaecbf..94db232240d55 100644
--- a/vllm/executor/uniproc_executor.py
+++ b/vllm/executor/uniproc_executor.py
@@ -28,6 +28,11 @@ class UniProcExecutor(ExecutorBase):
         distributed_init_method = get_distributed_init_method(
             get_ip(), get_open_port())
         local_rank = 0
+        # set local rank as the device index if specified
+        device_info = self.vllm_config.device_config.device.__str__().split(
+            ":")
+        if len(device_info) > 1:
+            local_rank = int(device_info[1])
         rank = 0
         kwargs = dict(
             vllm_config=self.vllm_config,

From 7539bbc6a6715dc8e5e71730e2377219b0e69e21 Mon Sep 17 00:00:00 2001
From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Date: Tue, 11 Feb 2025 08:47:10 -0500
Subject: [PATCH 12/43] [ROCm] Using a more precise memory profiling (#12624)

Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
---
 vllm/platforms/rocm.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 1f690b7111ee2..13aebc605af74 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -169,4 +169,5 @@ class RocmPlatform(Platform):
                                  device: Optional[torch.types.Device] = None
                                  ) -> float:
         torch.cuda.reset_peak_memory_stats(device)
-        return torch.cuda.max_memory_allocated(device)
+        return torch.cuda.mem_get_info(device)[1] - torch.cuda.mem_get_info(
+            device)[0]

From da317197dd9352a7718d9bf697c2a5aeb9d42b41 Mon Sep 17 00:00:00 2001
From: Yuhong Guo <yuhong.gyh@antgroup.com>
Date: Tue, 11 Feb 2025 21:55:57 +0800
Subject: [PATCH 13/43] [Build] Fix cuda link target of cumem_allocator in CPU
 env (#12863)

Signed-off-by: YuhongGuo <yuhong.gyh@antgroup.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b99061dfde4fd..a0fd346c6c153 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -192,7 +192,7 @@ set_gencode_flags_for_srcs(
 if(VLLM_GPU_LANG STREQUAL "CUDA")
   message(STATUS "Enabling cumem allocator extension.")
   # link against cuda driver library
-  list(APPEND CUMEM_LIBS cuda)
+  list(APPEND CUMEM_LIBS CUDA::cuda_driver)
   define_gpu_extension_target(
     cumem_allocator
     DESTINATION vllm

From 2e3b969ec0d46e2cfff041a07f29a2ca4bb82bbd Mon Sep 17 00:00:00 2001
From: wangxiyuan <wangxiyuan1007@gmail.com>
Date: Tue, 11 Feb 2025 22:06:46 +0800
Subject: [PATCH 14/43] [Platform] add pre_register_and_update function
 (#12432)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
---
 vllm/config.py              |  3 ++-
 vllm/engine/arg_utils.py    | 21 +++++++++++++++++++++
 vllm/platforms/interface.py | 18 ++++++++++++++++++
 3 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/vllm/config.py b/vllm/config.py
index 426ba38080270..1d8c42dd276a6 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3057,7 +3057,8 @@ class VllmConfig:
     kv_transfer_config: KVTransferConfig = field(default=None,
                                                  init=True)  # type: ignore
     # some opaque config, only used to provide additional information
-    # for the hash computation, mainly used for testing and debugging.
+    # for the hash computation, mainly used for testing, debugging or out of
+    # tree config registration.
     additional_config: SupportsHash = field(default=None,
                                             init=True)  # type: ignore
     instance_id: str = ""
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 40c6fb4567993..4232ad9204f44 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -20,6 +20,7 @@ from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat,
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.plugins import load_general_plugins
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser, StoreBoolean
@@ -203,6 +204,8 @@ class EngineArgs:
 
     calculate_kv_scales: Optional[bool] = None
 
+    additional_config: Optional[Dict[str, Any]] = None
+
     def __post_init__(self):
         if not self.tokenizer:
             self.tokenizer = self.model
@@ -984,6 +987,14 @@ class EngineArgs:
             'be loaded from the model checkpoint if available. '
             'Otherwise, the scales will default to 1.0.')
 
+        parser.add_argument(
+            "--additional-config",
+            type=json.loads,
+            default=None,
+            help="Additional config for specified platform in JSON format. "
+            "Different platforms may support different configs. Make sure the "
+            "configs are valid for the platform you are using. The input format"
+            " is like '{\"config_key\":\"config_value\"}'")
         return parser
 
     @classmethod
@@ -1044,6 +1055,9 @@ class EngineArgs:
     def create_engine_config(self,
                              usage_context: Optional[UsageContext] = None
                              ) -> VllmConfig:
+        from vllm.platforms import current_platform
+        current_platform.pre_register_and_update()
+
         if envs.VLLM_USE_V1:
             self._override_v1_engine_args(usage_context)
 
@@ -1287,6 +1301,7 @@ class EngineArgs:
             prompt_adapter_config=prompt_adapter_config,
             compilation_config=self.compilation_config,
             kv_transfer_config=self.kv_transfer_config,
+            additional_config=self.additional_config,
         )
 
         if envs.VLLM_USE_V1:
@@ -1347,6 +1362,12 @@ class AsyncEngineArgs(EngineArgs):
         parser.add_argument('--disable-log-requests',
                             action='store_true',
                             help='Disable logging requests.')
+        # Initialize plugin to update the parser, for example, The plugin may
+        # adding a new kind of quantization method to --quantization argument or
+        # a new device to --device argument.
+        load_general_plugins()
+        from vllm.platforms import current_platform
+        current_platform.pre_register_and_update(parser)
         return parser
 
 
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 645d98a1bb42c..61673b08543f6 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -13,8 +13,10 @@ from vllm.logger import init_logger
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
+    from vllm.utils import FlexibleArgumentParser
 else:
     VllmConfig = None
+    FlexibleArgumentParser = None
 
 logger = init_logger(__name__)
 
@@ -223,6 +225,22 @@ class Platform:
             np.random.seed(seed)
             torch.manual_seed(seed)
 
+    @classmethod
+    def pre_register_and_update(cls,
+                                parser: Optional[FlexibleArgumentParser] = None
+                                ) -> None:
+        """
+        Do some pre-registeration or update action for the current platform.
+
+        This function is called before global VllmConfig is initialized or cli
+        arguments are parsed. It's used for out-of-tree platforms to register or
+        update the configuration.
+
+        For example, the out-of-tree quantization config can be imported and
+        registered here dynamically.
+        """
+        pass
+
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         """

From 110f59a33e22aaa16a1d0278bb19f76e4fe5f5a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E0=AE=AE=E0=AE=A9=E0=AF=8B=E0=AE=9C=E0=AF=8D=E0=AE=95?=
 =?UTF-8?q?=E0=AF=81=E0=AE=AE=E0=AE=BE=E0=AE=B0=E0=AF=8D=20=E0=AE=AA?=
 =?UTF-8?q?=E0=AE=B4=E0=AE=A9=E0=AE=BF=E0=AE=9A=E0=AF=8D=E0=AE=9A=E0=AE=BE?=
 =?UTF-8?q?=E0=AE=AE=E0=AE=BF?= <smartmanoj42857@gmail.com>
Date: Tue, 11 Feb 2025 20:11:20 +0530
Subject: [PATCH 15/43] [Bugfix] fix flaky test (#13089)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: மனோஜ்குமார் பழனிச்சாமி <smartmanoj42857@gmail.com>
---
 tests/test_seed_behavior.py | 27 ++++++---------------------
 1 file changed, 6 insertions(+), 21 deletions(-)

diff --git a/tests/test_seed_behavior.py b/tests/test_seed_behavior.py
index 7e4e71563e7d3..c45ed6926d772 100644
--- a/tests/test_seed_behavior.py
+++ b/tests/test_seed_behavior.py
@@ -8,32 +8,17 @@ from vllm.platforms.interface import Platform
 
 
 def test_seed_behavior():
-    # Test with seed=None
-    Platform.seed_everything(None)
+    # Test with a specific seed
+    Platform.seed_everything(42)
     random_value_1 = random.randint(0, 100)
     np_random_value_1 = np.random.randint(0, 100)
     torch_random_value_1 = torch.randint(0, 100, (1, )).item()
 
-    Platform.seed_everything(None)
+    Platform.seed_everything(42)
     random_value_2 = random.randint(0, 100)
     np_random_value_2 = np.random.randint(0, 100)
     torch_random_value_2 = torch.randint(0, 100, (1, )).item()
 
-    assert random_value_1 != random_value_2
-    assert np_random_value_1 != np_random_value_2
-    assert torch_random_value_1 != torch_random_value_2
-
-    # Test with a specific seed
-    Platform.seed_everything(42)
-    random_value_3 = random.randint(0, 100)
-    np_random_value_3 = np.random.randint(0, 100)
-    torch_random_value_3 = torch.randint(0, 100, (1, )).item()
-
-    Platform.seed_everything(42)
-    random_value_4 = random.randint(0, 100)
-    np_random_value_4 = np.random.randint(0, 100)
-    torch_random_value_4 = torch.randint(0, 100, (1, )).item()
-
-    assert random_value_3 == random_value_4
-    assert np_random_value_3 == np_random_value_4
-    assert torch_random_value_3 == torch_random_value_4
+    assert random_value_1 == random_value_2
+    assert np_random_value_1 == np_random_value_2
+    assert torch_random_value_1 == torch_random_value_2

From 75e6e145164c8e47a97b6e29654fe81b2fbc1ff5 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Tue, 11 Feb 2025 15:14:00 +0000
Subject: [PATCH 16/43] [V1][Metrics] Add several request timing histograms
 (#12644)

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
 tests/entrypoints/openai/test_metrics.py   | 31 +++++++
 tests/v1/core/test_scheduler.py            |  3 +-
 tests/v1/engine/test_engine_core.py        |  6 +-
 tests/v1/engine/test_engine_core_client.py |  2 +
 tests/v1/engine/test_output_processor.py   | 23 +++--
 vllm/v1/core/kv_cache_manager.py           |  3 +
 vllm/v1/core/scheduler.py                  | 33 +++++++-
 vllm/v1/engine/__init__.py                 | 33 +++++++-
 vllm/v1/engine/async_llm.py                | 24 +++---
 vllm/v1/engine/core.py                     | 10 ++-
 vllm/v1/engine/core_client.py              | 19 +++--
 vllm/v1/engine/llm_engine.py               |  1 +
 vllm/v1/engine/output_processor.py         | 59 +++++++++----
 vllm/v1/metrics/loggers.py                 | 49 +++++++++++
 vllm/v1/metrics/stats.py                   | 99 ++++++++++++++++------
 vllm/v1/request.py                         | 25 ++++--
 16 files changed, 335 insertions(+), 85 deletions(-)

diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py
index 8c1bb1a897e37..34b648b6e99d7 100644
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -85,6 +85,10 @@ EXPECTED_VALUES = {
     "vllm:time_per_output_token_seconds":
     [("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))],
     "vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
+    "vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)],
+    "vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)],
+    "vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)],
+    "vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)],
     "vllm:request_prompt_tokens":
     [("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
      ("_count", _NUM_REQUESTS)],
@@ -169,6 +173,18 @@ EXPECTED_METRICS = [
     "vllm:e2e_request_latency_seconds_sum",
     "vllm:e2e_request_latency_seconds_bucket",
     "vllm:e2e_request_latency_seconds_count",
+    "vllm:request_queue_time_seconds_sum",
+    "vllm:request_queue_time_seconds_bucket",
+    "vllm:request_queue_time_seconds_count",
+    "vllm:request_inference_time_seconds_sum",
+    "vllm:request_inference_time_seconds_bucket",
+    "vllm:request_inference_time_seconds_count",
+    "vllm:request_prefill_time_seconds_sum",
+    "vllm:request_prefill_time_seconds_bucket",
+    "vllm:request_prefill_time_seconds_count",
+    "vllm:request_decode_time_seconds_sum",
+    "vllm:request_decode_time_seconds_bucket",
+    "vllm:request_decode_time_seconds_count",
     "vllm:request_prompt_tokens_sum",
     "vllm:request_prompt_tokens_bucket",
     "vllm:request_prompt_tokens_count",
@@ -220,6 +236,21 @@ EXPECTED_METRICS_V1 = [
     "vllm:time_per_output_token_seconds_sum",
     "vllm:time_per_output_token_seconds_bucket",
     "vllm:time_per_output_token_seconds_count",
+    "vllm:e2e_request_latency_seconds_sum",
+    "vllm:e2e_request_latency_seconds_bucket",
+    "vllm:e2e_request_latency_seconds_count",
+    "vllm:request_queue_time_seconds_sum",
+    "vllm:request_queue_time_seconds_bucket",
+    "vllm:request_queue_time_seconds_count",
+    "vllm:request_inference_time_seconds_sum",
+    "vllm:request_inference_time_seconds_bucket",
+    "vllm:request_inference_time_seconds_count",
+    "vllm:request_prefill_time_seconds_sum",
+    "vllm:request_prefill_time_seconds_bucket",
+    "vllm:request_prefill_time_seconds_count",
+    "vllm:request_decode_time_seconds_sum",
+    "vllm:request_decode_time_seconds_bucket",
+    "vllm:request_decode_time_seconds_count",
 ]
 
 
diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index 0d29729a454cf..8aba46aec4777 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -38,7 +38,8 @@ def create_scheduler(
     return Scheduler(scheduler_config,
                      model_config,
                      cache_config,
-                     lora_config=None)
+                     lora_config=None,
+                     log_stats=True)
 
 
 def create_requests(
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index 6a91f190118fb..36b31550dc0e3 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -50,7 +50,8 @@ def test_engine_core(monkeypatch):
         executor_class = Executor.get_class(vllm_config)
 
         engine_core = EngineCore(vllm_config=vllm_config,
-                                 executor_class=executor_class)
+                                 executor_class=executor_class,
+                                 log_stats=True)
         """Test basic request lifecycle."""
 
         # First request.
@@ -157,7 +158,8 @@ def test_engine_core_advanced_sampling(monkeypatch):
         executor_class = Executor.get_class(vllm_config)
 
         engine_core = EngineCore(vllm_config=vllm_config,
-                                 executor_class=executor_class)
+                                 executor_class=executor_class,
+                                 log_stats=True)
         """Test basic request lifecycle."""
         # First request.
         request: EngineCoreRequest = make_request()
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index b2539132f4e01..45080be8e8ce1 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -94,6 +94,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
             asyncio_mode=False,
             vllm_config=vllm_config,
             executor_class=executor_class,
+            log_stats=False,
         )
 
         MAX_TOKENS = 20
@@ -163,6 +164,7 @@ async def test_engine_core_client_asyncio(monkeypatch):
             asyncio_mode=True,
             vllm_config=vllm_config,
             executor_class=executor_class,
+            log_stats=True,
         )
 
         MAX_TOKENS = 20
diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py
index c8f43edb70b3a..1d47df417ddaa 100644
--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import math
+import time
 from typing import Dict, List, Optional
 
 import pytest
@@ -15,6 +16,7 @@ from vllm.sequence import PromptLogprobs, SampleLogprobs
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.output_processor import OutputProcessor
+from vllm.v1.metrics.stats import IterationStats
 
 
 def _ref_convert_id_to_token(
@@ -603,6 +605,7 @@ def test_iteration_stats(dummy_test_vectors):
     output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group,
                                        log_stats=True)
     engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
+    engine_core_timestamp = time.monotonic()
 
     # Make N requests.
     requests = [
@@ -630,8 +633,9 @@ def test_iteration_stats(dummy_test_vectors):
 
     # First iteration has 2 prefills.
     outputs = engine_core.get_outputs()[:num_active]
-    processed_outputs = output_processor.process_outputs(outputs)
-    iteration_stats = processed_outputs.iteration_stats
+    iteration_stats = IterationStats()
+    output_processor.process_outputs(outputs, engine_core_timestamp,
+                                     iteration_stats)
     total_prompt_tokens = sum([
         len(prompt_tokens)
         for prompt_tokens in dummy_test_vectors.prompt_tokens[:num_active]
@@ -642,8 +646,9 @@ def test_iteration_stats(dummy_test_vectors):
 
     # Just decodes in this step.
     outputs = engine_core.get_outputs()[:num_active]
-    processed_outputs = output_processor.process_outputs(outputs)
-    iteration_stats = processed_outputs.iteration_stats
+    iteration_stats = IterationStats()
+    output_processor.process_outputs(outputs, engine_core_timestamp,
+                                     iteration_stats)
 
     assert iteration_stats.num_prompt_tokens == 0
     assert iteration_stats.num_generation_tokens == num_active
@@ -652,8 +657,9 @@ def test_iteration_stats(dummy_test_vectors):
     output_processor.add_request(inactive_request)
     num_active += 1
     outputs = engine_core.get_outputs()[:num_active]
-    processed_outputs = output_processor.process_outputs(outputs)
-    iteration_stats = processed_outputs.iteration_stats
+    iteration_stats = IterationStats()
+    output_processor.process_outputs(outputs, engine_core_timestamp,
+                                     iteration_stats)
     total_prompt_tokens = len(dummy_test_vectors.prompt_tokens[num_active - 1])
 
     assert iteration_stats.num_prompt_tokens == total_prompt_tokens
@@ -661,8 +667,9 @@ def test_iteration_stats(dummy_test_vectors):
 
     # Just decodes in this step.
     outputs = engine_core.get_outputs()[:num_active]
-    processed_outputs = output_processor.process_outputs(outputs)
-    iteration_stats = processed_outputs.iteration_stats
+    iteration_stats = IterationStats()
+    output_processor.process_outputs(outputs, engine_core_timestamp,
+                                     iteration_stats)
 
     assert iteration_stats.num_prompt_tokens == 0
     assert iteration_stats.num_generation_tokens == num_active
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index f75d31f542cf7..0381e5cdd09dc 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -26,6 +26,7 @@ class KVCacheManager:
         sliding_window: Optional[int] = None,
         enable_caching: bool = True,
         num_preallocate_tokens: int = 64,
+        log_stats: bool = False,
     ) -> None:
         self.block_size = block_size
         self.num_gpu_blocks = num_gpu_blocks
@@ -33,6 +34,8 @@ class KVCacheManager:
         self.max_num_blocks_per_req = cdiv(max_model_len, block_size)
         self.sliding_window = sliding_window
         self.enable_caching = enable_caching
+        # FIXME: make prefix cache stats conditional on log_stats
+        self.log_stats = log_stats
         # NOTE(woosuk): To avoid frequent block allocation, we preallocate some
         # blocks for each request. For example, when a request reaches the end
         # of its block table, we preallocate N blocks in advance. This way, we
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index 985fcf01bb216..e32e557ae232c 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
+import time
 from collections import deque
 from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union
 
@@ -10,7 +11,8 @@ from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
 from vllm.v1.core.kv_cache_manager import KVCacheManager
 from vllm.v1.core.scheduler_output import (CachedRequestData, NewRequestData,
                                            SchedulerOutput)
-from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs
+from vllm.v1.engine import (EngineCoreEvent, EngineCoreEventType,
+                            EngineCoreOutput, EngineCoreOutputs)
 from vllm.v1.metrics.stats import SchedulerStats
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
@@ -26,10 +28,12 @@ class Scheduler:
         model_config: ModelConfig,
         cache_config: CacheConfig,
         lora_config: Optional[LoRAConfig],
+        log_stats: bool,
     ) -> None:
         self.scheduler_config = scheduler_config
         self.cache_config = cache_config
         self.lora_config = lora_config
+        self.log_stats = log_stats
 
         # Scheduling constraints.
         self.max_num_running_reqs = self.scheduler_config.max_num_seqs
@@ -45,7 +49,8 @@ class Scheduler:
             num_gpu_blocks=num_gpu_blocks,
             max_model_len=self.max_model_len,
             sliding_window=self.cache_config.sliding_window,
-            enable_caching=self.cache_config.enable_prefix_caching)
+            enable_caching=self.cache_config.enable_prefix_caching,
+            log_stats=self.log_stats)
         self.block_size = self.cache_config.block_size
 
         # req_id -> Request
@@ -107,6 +112,8 @@ class Scheduler:
         scheduled_encoder_inputs: Dict[str, List[int]] = {}
         encoder_budget = self.max_num_encoder_input_tokens
 
+        scheduled_timestamp = time.monotonic()
+
         # First, schedule the RUNNING requests.
         req_index = 0
         while req_index < len(self.running) and token_budget > 0:
@@ -246,6 +253,7 @@ class Scheduler:
                 self.running.append(request)
                 if request.status == RequestStatus.WAITING:
                     scheduled_new_reqs.append(request)
+                    self.request_scheduled(request, scheduled_timestamp)
                 elif request.status == RequestStatus.PREEMPTED:
                     scheduled_resumed_reqs.append(request)
                 else:
@@ -508,7 +516,8 @@ class Scheduler:
                         finish_reason=request.get_finished_reason(),
                         new_logprobs=new_logprobs,
                         new_prompt_logprobs_tensors=prompt_logprobs_tensors,
-                        stop_reason=request.stop_reason))
+                        stop_reason=request.stop_reason,
+                        events=request.take_events()))
 
             if not stopped:
                 new_running.append(request)
@@ -541,6 +550,7 @@ class Scheduler:
     def add_request(self, request: Request) -> None:
         self.waiting.append(request)
         self.requests[request.request_id] = request
+        self.request_queued(request)
 
     def finish_requests(
         self,
@@ -588,7 +598,22 @@ class Scheduler:
     def reset_prefix_cache(self) -> bool:
         return self.kv_cache_manager.reset_prefix_cache()
 
-    def make_stats(self) -> SchedulerStats:
+    def request_queued(self, request: Request):
+        if not self.log_stats:
+            return
+        request.events.append(
+            EngineCoreEvent.new_event(EngineCoreEventType.QUEUED))
+
+    def request_scheduled(self, request: Request, timestamp: float):
+        if not self.log_stats:
+            return
+        request.events.append(
+            EngineCoreEvent.new_event(EngineCoreEventType.SCHEDULED,
+                                      timestamp))
+
+    def make_stats(self) -> Optional[SchedulerStats]:
+        if not self.log_stats:
+            return None
         return SchedulerStats(
             num_running_reqs=len(self.running),
             num_waiting_reqs=len(self.waiting),
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 30e1185019d9d..782fdcee3805a 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import enum
+import time
 from typing import List, Optional, Union
 
 import msgspec
@@ -60,6 +61,30 @@ class EngineCoreRequest(
     lora_request: Optional[LoRARequest]
 
 
+class EngineCoreEventType(enum.IntEnum):
+    """The type of engine core request event."""
+    QUEUED = 1
+    SCHEDULED = 2
+
+
+class EngineCoreEvent(msgspec.Struct):
+    """A timestamped engine core event associated with a request.
+
+    The timestamp is a monotonic timestamps and is used for by the engine
+    frontend to calculate intervals between engine core events. These
+    timestamps should not be compared with timestamps from other processes.
+    """
+    type: EngineCoreEventType
+    timestamp: float
+
+    @classmethod
+    def new_event(cls,
+                  event_type: EngineCoreEventType,
+                  timestamp: Optional[float] = None) -> "EngineCoreEvent":
+        timestamp = time.monotonic() if timestamp is None else timestamp
+        return cls(event_type, timestamp)
+
+
 class EngineCoreOutput(
         msgspec.Struct,
         array_like=True,  # type: ignore[call-arg]
@@ -74,6 +99,7 @@ class EngineCoreOutput(
 
     finish_reason: Optional[FinishReason] = None
     stop_reason: Union[int, str, None] = None
+    events: Optional[List[EngineCoreEvent]] = None
 
     @property
     def finished(self) -> bool:
@@ -91,7 +117,12 @@ class EngineCoreOutputs(
 
     # [num_reqs]
     outputs: List[EngineCoreOutput]
-    scheduler_stats: SchedulerStats
+    scheduler_stats: Optional[SchedulerStats]
+    timestamp: float = 0.0
+
+    def __post_init__(self):
+        if self.timestamp == 0.0:
+            self.timestamp = time.monotonic()
 
 
 class EngineCoreRequestType(enum.Enum):
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 3c4e35e4aa274..f19d2ed8bcb6c 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -53,10 +53,12 @@ class AsyncLLM(EngineClient):
 
         self.log_requests = log_requests
         self.log_stats = log_stats
-        self.stat_loggers: List[StatLoggerBase] = [
-            LoggingStatLogger(),
-            PrometheusStatLogger(vllm_config.model_config),
-        ]
+        self.stat_loggers: List[StatLoggerBase] = []
+        if self.log_stats:
+            self.stat_loggers.extend([
+                LoggingStatLogger(),
+                PrometheusStatLogger(vllm_config.model_config),
+            ])
 
         # Tokenizer (+ ensure liveness if running in another process).
         self.tokenizer = init_tokenizer_from_configs(
@@ -85,6 +87,7 @@ class AsyncLLM(EngineClient):
             asyncio_mode=True,
             vllm_config=vllm_config,
             executor_class=executor_class,
+            log_stats=self.log_stats,
         )
 
         self.output_handler: Optional[asyncio.Task] = None
@@ -246,6 +249,8 @@ class AsyncLLM(EngineClient):
                 # 1) Pull EngineCoreOutputs from the EngineCore.
                 outputs = await self.engine_core.get_output_async()
 
+                iteration_stats = IterationStats() if self.log_stats else None
+
                 # Split outputs into chunks of at most
                 # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the
                 # event loop for too long.
@@ -257,14 +262,12 @@ class AsyncLLM(EngineClient):
                         outputs.outputs,
                         cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE))
 
-                iteration_stats = None
                 for i, outputs_slice in enumerate(slices):
                     # 2) Process EngineCoreOutputs.
                     processed_outputs = self.output_processor.process_outputs(
-                        outputs_slice, iteration_stats)
+                        outputs_slice, outputs.timestamp, iteration_stats)
                     # NOTE: RequestOutputs are pushed to their queues.
                     assert not processed_outputs.request_outputs
-                    iteration_stats = processed_outputs.iteration_stats
 
                     # Allow other asyncio tasks to run between chunks
                     if i + 1 < len(slices):
@@ -277,7 +280,6 @@ class AsyncLLM(EngineClient):
                 # 4) Logging.
                 # TODO(rob): make into a coroutine and launch it in
                 # background thread once Prometheus overhead is non-trivial.
-                assert iteration_stats is not None
                 self._log_stats(
                     scheduler_stats=outputs.scheduler_stats,
                     iteration_stats=iteration_stats,
@@ -299,12 +301,14 @@ class AsyncLLM(EngineClient):
 
     def _log_stats(
         self,
-        scheduler_stats: SchedulerStats,
-        iteration_stats: IterationStats,
+        scheduler_stats: Optional[SchedulerStats],
+        iteration_stats: Optional[IterationStats],
     ):
         if not self.log_stats:
             return
 
+        assert scheduler_stats is not None
+        assert iteration_stats is not None
         for logger in self.stat_loggers:
             logger.log(scheduler_stats=scheduler_stats,
                        iteration_stats=iteration_stats)
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index c90667ba0331e..e4677681bd2bb 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -38,12 +38,15 @@ class EngineCore:
         self,
         vllm_config: VllmConfig,
         executor_class: Type[Executor],
+        log_stats: bool,
     ):
         assert vllm_config.model_config.runner_type != "pooling"
 
         logger.info("Initializing a V1 LLM engine (v%s) with config: %s",
                     VLLM_VERSION, vllm_config)
 
+        self.log_stats = log_stats
+
         # Setup Model.
         self.model_executor = executor_class(vllm_config)
 
@@ -59,6 +62,7 @@ class EngineCore:
             model_config=vllm_config.model_config,
             cache_config=vllm_config.cache_config,
             lora_config=vllm_config.lora_config,
+            log_stats=self.log_stats,
         )
 
         self.mm_input_mapper_server = MMInputMapperServer(
@@ -148,11 +152,9 @@ class EngineCoreProc(EngineCore):
         ready_pipe: Connection,
         vllm_config: VllmConfig,
         executor_class: Type[Executor],
-        log_stats: bool = False,
+        log_stats: bool,
     ):
-        super().__init__(vllm_config, executor_class)
-
-        self.log_stats = log_stats
+        super().__init__(vllm_config, executor_class, log_stats)
 
         # Background Threads and Queues for IO. These enable us to
         # overlap ZMQ socket IO with GPU since they release the GIL,
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 2d7d6b42ced52..b3de5cdc244f3 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -41,6 +41,7 @@ class EngineCoreClient(ABC):
         asyncio_mode: bool,
         vllm_config: VllmConfig,
         executor_class: Type[Executor],
+        log_stats: bool,
     ) -> "EngineCoreClient":
 
         # TODO: support this for debugging purposes.
@@ -50,12 +51,12 @@ class EngineCoreClient(ABC):
                 "is not currently supported.")
 
         if multiprocess_mode and asyncio_mode:
-            return AsyncMPClient(vllm_config, executor_class)
+            return AsyncMPClient(vllm_config, executor_class, log_stats)
 
         if multiprocess_mode and not asyncio_mode:
-            return SyncMPClient(vllm_config, executor_class)
+            return SyncMPClient(vllm_config, executor_class, log_stats)
 
-        return InprocClient(vllm_config, executor_class)
+        return InprocClient(vllm_config, executor_class, log_stats)
 
     @abstractmethod
     def shutdown(self):
@@ -204,13 +205,13 @@ class MPClient(EngineCoreClient):
 class SyncMPClient(MPClient):
     """Synchronous client for multi-proc EngineCore."""
 
-    def __init__(self, vllm_config: VllmConfig,
-                 executor_class: Type[Executor]):
+    def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor],
+                 log_stats: bool):
         super().__init__(
             asyncio_mode=False,
             vllm_config=vllm_config,
             executor_class=executor_class,
-            log_stats=False,
+            log_stats=log_stats,
         )
 
     def get_output(self) -> EngineCoreOutputs:
@@ -245,13 +246,13 @@ class SyncMPClient(MPClient):
 class AsyncMPClient(MPClient):
     """Asyncio-compatible client for multi-proc EngineCore."""
 
-    def __init__(self, vllm_config: VllmConfig,
-                 executor_class: Type[Executor]):
+    def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor],
+                 log_stats: bool):
         super().__init__(
             asyncio_mode=True,
             vllm_config=vllm_config,
             executor_class=executor_class,
-            log_stats=True,
+            log_stats=log_stats,
         )
 
         self.outputs_queue: Optional[asyncio.Queue[bytes]] = None
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 3ef5a9706063a..c9a4c5369dfd8 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -73,6 +73,7 @@ class LLMEngine:
             asyncio_mode=False,
             vllm_config=vllm_config,
             executor_class=executor_class,
+            log_stats=False,  # FIXME: implement
         )
 
     @classmethod
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 5dbf530caa17a..7973c62c381ff 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -19,7 +19,6 @@ class OutputProcessorOutput:
 
     request_outputs: List[RequestOutput]
     reqs_to_abort: List[str]
-    iteration_stats: IterationStats
 
 
 class RequestState:
@@ -34,6 +33,7 @@ class RequestState:
         detokenizer: IncrementalDetokenizer,
         arrival_time: float,
         queue: Optional[asyncio.Queue[RequestOutput]],
+        log_stats: bool,
     ):
         self.request_id = request_id
         self.output_kind = output_kind
@@ -45,14 +45,16 @@ class RequestState:
         self.is_prefilling = True
         self.queue = queue
 
-        self.stats = RequestStateStats(last_token_time=arrival_time)
+        self.stats = RequestStateStats(
+            arrival_time=arrival_time) if log_stats else None
 
     @classmethod
     def from_new_request(
         cls,
         tokenizer: AnyTokenizer,
         request: EngineCoreRequest,
-        queue: Optional[asyncio.Queue[RequestOutput]] = None,
+        queue: Optional[asyncio.Queue[RequestOutput]],
+        log_stats: bool,
     ) -> "RequestState":
         return cls(
             request_id=request.request_id,
@@ -69,6 +71,7 @@ class RequestState:
             ),
             arrival_time=request.arrival_time,
             queue=queue,
+            log_stats=log_stats,
         )
 
 
@@ -112,11 +115,13 @@ class OutputProcessor:
         self.request_states[request_id] = RequestState.from_new_request(
             tokenizer=self.tokenizer.get_lora_tokenizer(request.lora_request),
             request=request,
-            queue=queue)
+            queue=queue,
+            log_stats=self.log_stats)
 
     def process_outputs(
         self,
         engine_core_outputs: List[EngineCoreOutput],
+        engine_core_timestamp: Optional[float] = None,
         iteration_stats: Optional[IterationStats] = None,
     ) -> OutputProcessorOutput:
         """
@@ -145,8 +150,6 @@ class OutputProcessor:
 
         request_outputs: List[RequestOutput] = []
         reqs_to_abort: List[str] = []
-        if not iteration_stats:
-            iteration_stats = IterationStats(self.log_stats)
         for engine_core_output in engine_core_outputs:
             req_id = engine_core_output.request_id
             req_state = self.request_states.get(req_id)
@@ -155,10 +158,9 @@ class OutputProcessor:
                 continue
 
             # 1) Compute stats for this iteration.
-            iteration_stats.update_from_output(engine_core_output,
-                                               req_state.is_prefilling,
-                                               req_state.prompt_len,
-                                               req_state.stats)
+            self._update_stats_from_output(req_state, engine_core_output,
+                                           engine_core_timestamp,
+                                           iteration_stats)
 
             new_token_ids = engine_core_output.new_token_ids
             finish_reason = engine_core_output.finish_reason
@@ -205,17 +207,44 @@ class OutputProcessor:
                         # detected stop string, abort needed in EngineCore.
                         reqs_to_abort.append(req_id)
 
-                    # Track per-request stats.
-                    assert finish_reason is not None
-                    iteration_stats.update_from_finished_request(
-                        finish_reason, request_output, req_state.stats)
+                    # Track per-request stats
+                    self._update_stats_from_finished(req_state, request_output,
+                                                     finish_reason,
+                                                     iteration_stats)
 
         return OutputProcessorOutput(
             request_outputs=request_outputs,
             reqs_to_abort=reqs_to_abort,
-            iteration_stats=iteration_stats,
         )
 
+    def _update_stats_from_output(self, req_state: RequestState,
+                                  engine_core_output: EngineCoreOutput,
+                                  engine_core_timestamp: Optional[float],
+                                  iteration_stats: Optional[IterationStats]):
+        if iteration_stats is None:
+            return
+
+        assert engine_core_timestamp is not None
+        assert req_state.stats is not None
+        iteration_stats.update_from_output(engine_core_output,
+                                           engine_core_timestamp,
+                                           req_state.is_prefilling,
+                                           req_state.prompt_len,
+                                           req_state.stats)
+
+    def _update_stats_from_finished(self, req_state: RequestState,
+                                    request_output: RequestOutput,
+                                    finish_reason: Optional[FinishReason],
+                                    iteration_stats: Optional[IterationStats]):
+        if iteration_stats is None:
+            return
+
+        assert finish_reason is not None
+        assert req_state.stats is not None
+        iteration_stats.update_from_finished_request(finish_reason,
+                                                     request_output,
+                                                     req_state.stats)
+
     @staticmethod
     def _make_request_output(
         request_state: RequestState,
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 3472761dc1808..439be38a3e795 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -182,6 +182,45 @@ class PrometheusStatLogger(StatLoggerBase):
                 ],
                 labelnames=labelnames).labels(*labelvalues)
 
+        request_latency_buckets = [
+            0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
+            40.0, 50.0, 60.0
+        ]
+        self.histogram_e2e_time_request = \
+            prometheus_client.Histogram(
+                name="vllm:e2e_request_latency_seconds",
+                documentation="Histogram of e2e request latency in seconds.",
+                buckets=request_latency_buckets,
+                labelnames=labelnames).labels(*labelvalues)
+        self.histogram_queue_time_request = \
+            prometheus_client.Histogram(
+                name="vllm:request_queue_time_seconds",
+                documentation=
+                "Histogram of time spent in WAITING phase for request.",
+                buckets=request_latency_buckets,
+                labelnames=labelnames).labels(*labelvalues)
+        self.histogram_inference_time_request = \
+            prometheus_client.Histogram(
+                name="vllm:request_inference_time_seconds",
+                documentation=
+                "Histogram of time spent in RUNNING phase for request.",
+                buckets=request_latency_buckets,
+                labelnames=labelnames).labels(*labelvalues)
+        self.histogram_prefill_time_request = \
+            prometheus_client.Histogram(
+                name="vllm:request_prefill_time_seconds",
+                documentation=
+                "Histogram of time spent in PREFILL phase for request.",
+                buckets=request_latency_buckets,
+                labelnames=labelnames).labels(*labelvalues)
+        self.histogram_decode_time_request = \
+            prometheus_client.Histogram(
+                name="vllm:request_decode_time_seconds",
+                documentation=
+                "Histogram of time spent in DECODE phase for request.",
+                buckets=request_latency_buckets,
+                labelnames=labelnames).labels(*labelvalues)
+
     def log(self, scheduler_stats: SchedulerStats,
             iteration_stats: IterationStats):
         """Log to prometheus."""
@@ -201,6 +240,12 @@ class PrometheusStatLogger(StatLoggerBase):
 
         for finished_request in iteration_stats.finished_requests:
             self.counter_request_success[finished_request.finish_reason].inc()
+            self.histogram_e2e_time_request.observe(
+                finished_request.e2e_latency)
+            self.histogram_inference_time_request.observe(
+                finished_request.inference_time)
+            self.histogram_decode_time_request.observe(
+                finished_request.decode_time)
             self.histogram_num_prompt_tokens_request.observe(
                 finished_request.num_prompt_tokens)
             self.histogram_num_generation_tokens_request.observe(
@@ -210,6 +255,10 @@ class PrometheusStatLogger(StatLoggerBase):
             self.histogram_time_to_first_token.observe(ttft)
         for tpot in iteration_stats.time_per_output_tokens_iter:
             self.histogram_time_per_output_token.observe(tpot)
+        for queue_time in iteration_stats.queue_times_iter:
+            self.histogram_queue_time_request.observe(queue_time)
+        for prefill_time in iteration_stats.prefill_times_iter:
+            self.histogram_prefill_time_request.observe(prefill_time)
 
     @staticmethod
     def _unregister_vllm_metrics():
diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py
index f806b0adf5d5a..a0e6204929eb3 100644
--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, List
 
 if TYPE_CHECKING:
     from vllm.outputs import RequestOutput
-    from vllm.v1.engine import EngineCoreOutput, FinishReason
+    from vllm.v1.engine import EngineCoreEvent, EngineCoreOutput, FinishReason
 
 
 @dataclass
@@ -41,7 +41,15 @@ class RequestStateStats:
     """Stats that need to be tracked across delta updates."""
 
     num_generation_tokens: int = 0
-    last_token_time: float = 0.0
+
+    # This is a engine frontend timestamp (wall-clock)
+    arrival_time: float = 0.0
+
+    # These are engine core timestamps (monotonic)
+    queued_ts: float = 0.0
+    scheduled_ts: float = 0.0
+    first_token_ts: float = 0.0
+    last_token_ts: float = 0.0
 
 
 @dataclass
@@ -49,33 +57,37 @@ class FinishedRequestStats:
     """Stats associated with a finished request."""
 
     finish_reason: "FinishReason"
+    e2e_latency: float = 0.0
     num_prompt_tokens: int = 0
     num_generation_tokens: int = 0
+    inference_time: float = 0.0
+    decode_time: float = 0.0
 
 
 class IterationStats:
     """Stats associated with a single set of EngineCoreOutputs."""
 
-    def __init__(self, log_stats: bool):
-        self.log_stats = log_stats
+    def __init__(self):
+        self.iteration_timestamp = time.time()
         self.num_generation_tokens = 0
         self.num_prompt_tokens = 0
         self.finished_requests: List[FinishedRequestStats] = []
         self.time_to_first_tokens_iter: List[float] = []
         self.time_per_output_tokens_iter: List[float] = []
+        self.queue_times_iter: List[float] = []
+        self.prefill_times_iter: List[float] = []
+
+    def _time_since(self, start: float) -> float:
+        """Calculate an interval relative to this iteration's timestamp."""
+        return self.iteration_timestamp - start
 
     def update_from_output(self, output: "EngineCoreOutput",
-                           is_prefilling: bool, prompt_len: int,
-                           request_state_stats: RequestStateStats):
-        if not self.log_stats:
-            return
-
+                           engine_core_timestamp: float, is_prefilling: bool,
+                           prompt_len: int, req_stats: RequestStateStats):
         num_new_generation_tokens = len(output.new_token_ids)
-        now = time.time()
-        last_token_latency = now - request_state_stats.last_token_time
 
         self.num_generation_tokens += num_new_generation_tokens
-        if is_prefilling:
+        if is_prefilling and num_new_generation_tokens > 0:
             # TODO(andy): we used to assert that num_new_generation_tokens
             # > 0 with an invariant that EngineCore does not stream outputs
             # for partially completed prefills (scheduler.update_from_output
@@ -84,19 +96,58 @@ class IterationStats:
             # partially completed prompt.
             # This will be reverted in a follow up PR and we should re-enable
             # this assertion / invariant.
-            if num_new_generation_tokens > 0:
-                self.num_prompt_tokens += prompt_len
-                self.time_to_first_tokens_iter.append(last_token_latency)
-        else:
-            self.time_per_output_tokens_iter.append(last_token_latency)
+            self.num_prompt_tokens += prompt_len
 
-        request_state_stats.num_generation_tokens += num_new_generation_tokens
-        request_state_stats.last_token_time = now
+            first_token_latency = self._time_since(req_stats.arrival_time)
+            self.time_to_first_tokens_iter.append(first_token_latency)
+
+        req_stats.num_generation_tokens += num_new_generation_tokens
+
+        # Process request-level engine core events
+        if output.events is not None:
+            self.update_from_events(output.events, is_prefilling, req_stats)
+
+        # Process the batch-level "new tokens" engine core event
+        if is_prefilling:
+            # TODO: re-enable no-output-for-partial-prefills invariant as above
+            if num_new_generation_tokens > 0:
+                prefill_interval = \
+                    engine_core_timestamp - req_stats.scheduled_ts
+                self.prefill_times_iter.append(prefill_interval)
+                req_stats.first_token_ts = engine_core_timestamp
+        else:
+            tpot = engine_core_timestamp - req_stats.last_token_ts
+            self.time_per_output_tokens_iter.append(tpot)
+
+        # TODO: re-enable no-output-for-partial-prefills invariant as above
+        if num_new_generation_tokens > 0:
+            req_stats.last_token_ts = engine_core_timestamp
+
+    def update_from_events(self, events: List["EngineCoreEvent"],
+                           is_prefilling: bool, req_stats: RequestStateStats):
+        # Avoid circular dependency
+        from vllm.v1.engine import EngineCoreEventType
+        for event in events:
+            if event.type == EngineCoreEventType.QUEUED:
+                req_stats.queued_ts = event.timestamp
+            elif event.type == EngineCoreEventType.SCHEDULED:
+                queued_interval = event.timestamp - req_stats.queued_ts
+                self.queue_times_iter.append(queued_interval)
+                req_stats.scheduled_ts = event.timestamp
 
     def update_from_finished_request(self, finish_reason: "FinishReason",
                                      request_output: "RequestOutput",
-                                     request_state_stats: RequestStateStats):
-        self.finished_requests.append(
-            FinishedRequestStats(finish_reason,
-                                 len(request_output.prompt_token_ids),
-                                 request_state_stats.num_generation_tokens))
+                                     req_stats: RequestStateStats):
+        e2e_latency = self._time_since(req_stats.arrival_time)
+
+        inference_time = req_stats.last_token_ts - req_stats.scheduled_ts
+        decode_time = req_stats.last_token_ts - req_stats.first_token_ts
+
+        finished_req = \
+            FinishedRequestStats(finish_reason=finish_reason,
+                                 e2e_latency=e2e_latency,
+                                 num_prompt_tokens=len(request_output.prompt_token_ids),
+                                 num_generation_tokens=req_stats.num_generation_tokens,
+                                 inference_time=inference_time,
+                                 decode_time=decode_time)
+        self.finished_requests.append(finished_req)
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index bb4d2c19197bc..0ebaa71ce74cd 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -5,8 +5,8 @@ from typing import TYPE_CHECKING, List, Optional, Union
 
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import RequestMetrics
-from vllm.v1.engine import EngineCoreRequest, FinishReason
+from vllm.v1.engine import (EngineCoreEvent, EngineCoreEventType,
+                            EngineCoreRequest, FinishReason)
 from vllm.v1.utils import ConstantList
 
 if TYPE_CHECKING:
@@ -33,14 +33,10 @@ class Request:
         self.sampling_params = sampling_params
         # Because of LoRA, the eos token id can be different for each request.
         self.eos_token_id = eos_token_id
-        self.metrics = RequestMetrics(arrival_time=arrival_time,
-                                      last_token_time=arrival_time,
-                                      first_scheduled_time=None,
-                                      first_token_time=None,
-                                      time_in_queue=None)
         self.lora_request = lora_request
 
         self.status = RequestStatus.WAITING
+        self.events: List[EngineCoreEvent] = []
         self.stop_reason: Union[int, str, None] = None
         assert sampling_params.max_tokens is not None
         self.max_tokens = sampling_params.max_tokens
@@ -83,6 +79,21 @@ class Request:
             lora_request=request.lora_request,
         )
 
+    def queued(self, timestamp: Optional[float] = None) -> None:
+        self.events.append(
+            EngineCoreEvent.new_event(EngineCoreEventType.QUEUED, timestamp))
+
+    def scheduled(self, timestamp: Optional[float] = None) -> None:
+        self.events.append(
+            EngineCoreEvent.new_event(EngineCoreEventType.SCHEDULED,
+                                      timestamp))
+
+    def take_events(self) -> Optional[List[EngineCoreEvent]]:
+        if not self.events:
+            return None
+        events, self.events = self.events, []
+        return events
+
     def append_output_token_ids(
         self,
         token_ids: Union[int, List[int]],

From ad9776353e6b00d019415e94fd17c78ad4575ff7 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Tue, 11 Feb 2025 15:51:19 +0000
Subject: [PATCH 17/43] Set `torch_dtype` in `TransformersModel` (#13088)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/model_executor/models/transformers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py
index 43d2c88d3b9ca..1605467bc3dd6 100644
--- a/vllm/model_executor/models/transformers.py
+++ b/vllm/model_executor/models/transformers.py
@@ -143,6 +143,7 @@ class TransformersModel(nn.Module):
         self.model: PreTrainedModel = AutoModel.from_config(
             self.config,
             attn_implementation="vllm",
+            torch_dtype=vllm_config.model_config.dtype,
             trust_remote_code=vllm_config.model_config.trust_remote_code,
         )
         prefix = self.model.base_model_prefix

From bf3e05215c7f20baf9fcd82d8877d8453dcebf6e Mon Sep 17 00:00:00 2001
From: Jewon Lee <105219284+je1lee@users.noreply.github.com>
Date: Wed, 12 Feb 2025 01:20:37 +0900
Subject: [PATCH 18/43] [Misc] Fix typo at comments at metrics.py (#13024)

---
 vllm/engine/metrics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index ce806b4a937a1..7c55d66e50777 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -237,7 +237,7 @@ class Metrics:
             documentation="Count of successfully processed requests.",
             labelnames=labelnames + [Metrics.labelname_finish_reason])
 
-        # Speculatie decoding stats
+        # Speculative decoding stats
         self.gauge_spec_decode_draft_acceptance_rate = self._gauge_cls(
             name="vllm:spec_decode_draft_acceptance_rate",
             documentation="Speulative token acceptance rate.",

From 21f5d50fa557f431e9c76d432771337f5399c420 Mon Sep 17 00:00:00 2001
From: MoonRide303 <130458190+MoonRide303@users.noreply.github.com>
Date: Tue, 11 Feb 2025 17:21:18 +0100
Subject: [PATCH 19/43] [Bugfix] Do not use resource module on Windows (#12858)
 (#13029)

---
 vllm/utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/vllm/utils.py b/vllm/utils.py
index e168752766661..6a41afff8f04c 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -15,7 +15,6 @@ import ipaddress
 import multiprocessing
 import os
 import re
-import resource
 import signal
 import socket
 import subprocess
@@ -2070,6 +2069,11 @@ def memory_profiling(
 
 # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501
 def set_ulimit(target_soft_limit=65535):
+    if sys.platform.startswith('win'):
+        logger.info("Windows detected, skipping ulimit adjustment.")
+        return
+
+    import resource
     resource_type = resource.RLIMIT_NOFILE
     current_soft, current_hard = resource.getrlimit(resource_type)
 

From 6c4dbe23eb85e5d1da00ccaf4923a275d8769a7f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=84=8D=F0=9D=95=A0=F0=9D=95=9D=F0=9D=95=9D=F0=9D=95=A0?=
 =?UTF-8?q?=F0=9D=95=A8=20=F0=9D=95=84=F0=9D=95=92=F0=9D=95=9F?=
 <hollowman@opensuse.org>
Date: Tue, 11 Feb 2025 18:21:50 +0200
Subject: [PATCH 20/43] [BugFix] Pop instead of del CUDA_VISIBLE_DEVICES
 (#12962)

Signed-off-by: Hollow Man <hollowman@opensuse.org>
---
 examples/offline_inference/rlhf.py          |  2 +-
 examples/offline_inference/rlhf_colocate.py |  2 +-
 tests/distributed/test_comm_ops.py          | 10 +++++-----
 tests/distributed/test_custom_all_reduce.py |  4 ++--
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py
index 5000251c099f7..172d18cbce2f9 100644
--- a/examples/offline_inference/rlhf.py
+++ b/examples/offline_inference/rlhf.py
@@ -92,7 +92,7 @@ class MyLLM(LLM):
         # a hack to make the script work.
         # stop ray from manipulating CUDA_VISIBLE_DEVICES
         # at the top-level
-        del os.environ["CUDA_VISIBLE_DEVICES"]
+        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
         super().__init__(*args, **kwargs)
 
 
diff --git a/examples/offline_inference/rlhf_colocate.py b/examples/offline_inference/rlhf_colocate.py
index b921bc71feb99..15dc7edc18ad9 100644
--- a/examples/offline_inference/rlhf_colocate.py
+++ b/examples/offline_inference/rlhf_colocate.py
@@ -59,7 +59,7 @@ class MyLLM(LLM):
         # a hack to make the script work.
         # stop ray from manipulating CUDA_VISIBLE_DEVICES
         # at the top-level
-        del os.environ["CUDA_VISIBLE_DEVICES"]
+        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
         # every worker will use 0.4 GPU, so that we can schedule
         # 2 instances on the same GPUs.
         os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"
diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py
index bc916e8de07c4..7b0346b8ab50f 100644
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -22,7 +22,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -44,7 +44,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -72,7 +72,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
     # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
     # so that each worker can see all the GPUs
     # they will be able to set the device to the correct GPU
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -108,7 +108,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
 @ray.remote(num_gpus=1, max_calls=1)
 def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
                                       distributed_init_port: str):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -148,7 +148,7 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
 @ray.remote(num_gpus=1, max_calls=1)
 def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
                           distributed_init_port: str):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py
index 46887bca42a90..4928690bebb07 100644
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -24,7 +24,7 @@ for i, v in enumerate(test_sizes):
 
 @ray.remote(num_gpus=1, max_calls=1)
 def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,
@@ -80,7 +80,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
 
 @ray.remote(num_gpus=1, max_calls=1)
 def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
-    del os.environ["CUDA_VISIBLE_DEVICES"]
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)
     device = torch.device(f"cuda:{rank}")
     torch.cuda.set_device(device)
     init_test_distributed_environment(tp_size, pp_size, rank,

From 2b25b7d2e1bd915dde2890e7a923958c8d1eb8e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Szymon=20O=C5=BC=C3=B3g?=
 <58388001+SzymonOzog@users.noreply.github.com>
Date: Tue, 11 Feb 2025 17:38:48 +0100
Subject: [PATCH 21/43] Fix initializing GGUF weights for ColumnParallelLinear
 when using tensor parallel > 1 (#13023)

---
 vllm/model_executor/layers/linear.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index da8db08fe7152..dad16112082cb 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -335,6 +335,12 @@ class ColumnParallelLinear(LinearBase):
         tp_rank = get_tensor_model_parallel_rank()
         output_dim = getattr(param, "output_dim", None)
 
+        is_sharded_weight = getattr(param, "is_sharded_weight", False)
+        use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+        # bitsandbytes loads the weights of the specific portion
+        # no need to narrow
+        is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit
+
         # Special case for GGUF
         is_gguf_weight = getattr(param, "is_gguf_weight", False)
         is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
@@ -343,13 +349,12 @@ class ColumnParallelLinear(LinearBase):
 
         # Materialize GGUF UninitializedParameter
         if is_gguf_weight and isinstance(param, UninitializedParameter):
-            param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype)
-
-        use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
-        is_sharded_weight = getattr(param, "is_sharded_weight", False)
-        # bitsandbytes loads the weights of the specific portion
-        # no need to narrow
-        is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit
+            final_shape = list(loaded_weight.shape)
+            if output_dim is not None:
+                tp_size = get_tensor_model_parallel_world_size()
+                assert final_shape[output_dim] % tp_size == 0
+                final_shape[output_dim] = final_shape[output_dim] // tp_size
+            param.materialize(final_shape, dtype=loaded_weight.dtype)
 
         param_data = param.data
         if output_dim is not None and not is_sharded_weight:

From 565c1efa65358f43a78a52296d658651dd2b8f36 Mon Sep 17 00:00:00 2001
From: "Li, Jiang" <jiang1.li@intel.com>
Date: Wed, 12 Feb 2025 00:55:56 +0800
Subject: [PATCH 22/43] [CI/Build][Bugfix] Fix CPU backend default threads num
 (#13077)

---
 vllm/platforms/cpu.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 179ee6a7d2478..a9216c2322e92 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -115,6 +115,9 @@ class CpuPlatform(Platform):
         # Environment variables for CPU executor
         #
 
+        # Set default threads num for OpenMP parallel
+        os.environ["OMP_NUM_THREADS"] = str(torch.get_num_threads())
+
         # Disable torch async compiling which won't work with daemonic processes
         os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 

From deb6c1c6b4469984eb2a032099081f7f9e4ec8a8 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Tue, 11 Feb 2025 18:02:46 +0000
Subject: [PATCH 23/43] [Doc] Improve OpenVINO installation doc (#13102)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 .../installation/ai_accelerator/openvino.inc.md           | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
index 112e8d4d9b256..4f25252d9daff 100644
--- a/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
+++ b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md
@@ -19,17 +19,19 @@ Currently, there are no pre-built OpenVINO wheels.
 
 ### Build wheel from source
 
-First, install Python. For example, on Ubuntu 22.04, you can run:
+First, install Python and ensure you lave the latest pip. For example, on Ubuntu 22.04, you can run:
 
 ```console
 sudo apt-get update  -y
 sudo apt-get install python3
+pip install --upgrade pip
 ```
 
-Second, install prerequisites vLLM OpenVINO backend installation:
+Second, clone vLLM and install prerequisites for the vLLM OpenVINO backend installation:
 
 ```console
-pip install --upgrade pip
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
 pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
 ```
 

From 14ecab5be21b2af4ab1bbc6309d558ec620badc6 Mon Sep 17 00:00:00 2001
From: Yuan Tang <terrytangyuan@gmail.com>
Date: Tue, 11 Feb 2025 13:17:44 -0500
Subject: [PATCH 24/43] [Bugfix] Guided decoding falls back to outlines when
 fails to import xgrammar (#12976)

Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
---
 vllm/model_executor/guided_decoding/__init__.py          | 9 +++++++++
 vllm/model_executor/guided_decoding/xgrammar_decoding.py | 2 ++
 2 files changed, 11 insertions(+)

diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py
index cf96461a549f3..3eb7d186eb009 100644
--- a/vllm/model_executor/guided_decoding/__init__.py
+++ b/vllm/model_executor/guided_decoding/__init__.py
@@ -40,6 +40,8 @@ def maybe_backend_fallback(
             guided_params.backend = "outlines"
 
     if guided_params.backend == "xgrammar":
+        from vllm.model_executor.guided_decoding.xgrammar_decoding import (
+            xgr_installed)
         # xgrammar only has x86 wheels for linux, fallback to outlines
         from vllm.platforms import current_platform
         if current_platform.get_cpu_architecture() is not CpuArchEnum.X86:
@@ -77,6 +79,13 @@ def maybe_backend_fallback(
                     "Falling back to use outlines instead.")
                 guided_params.backend = "outlines"
 
+        # If the xgrammar module cannot be imported successfully,
+        # we should still allow users to use guided decoding with a fallback.
+        elif not xgr_installed:
+            logger.warning("xgrammar module cannot be imported successfully. "
+                           "Falling back to use outlines instead.")
+            guided_params.backend = "outlines"
+
     if (guided_params.backend == "outlines"
             and guided_params.json_object is not None):
         # outlines doesn't support json_object, fallback to xgrammar
diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
index c01bd3af1d5b9..fc3a4cd4bebc8 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -14,7 +14,9 @@ from transformers import PreTrainedTokenizerFast
 try:
     import xgrammar as xgr
     from xgrammar.base import _core as xgr_core
+    xgr_installed = True
 except ImportError:
+    xgr_installed = False
     pass
 
 from vllm.model_executor.guided_decoding.utils import (convert_lark_to_gbnf,

From 72c2b68dc9d4fb20eb135c22ee8c86caca48d28b Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Tue, 11 Feb 2025 17:34:16 -0500
Subject: [PATCH 25/43] [Misc] Move pre-commit suggestion back to the end
 (#13114)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 .pre-commit-config.yaml | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 352eb2df01b98..22b51afdc57a6 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -116,13 +116,6 @@ repos:
     language: python
     types: [python]
     exclude: 'vllm/third_party/.*'
-  - id: suggestion
-    name: Suggestion
-    entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
-    language: system
-    verbose: true
-    pass_filenames: false
-    exclude: 'vllm/third_party/.*'
   - id: check-filenames
     name: Check for spaces in all filenames
     entry: bash
@@ -133,3 +126,12 @@ repos:
     always_run: true
     pass_filenames: false
     exclude: 'vllm/third_party/.*'
+  # Keep `suggestion` last
+  - id: suggestion
+    name: Suggestion
+    entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
+    language: system
+    verbose: true
+    pass_filenames: false
+    exclude: 'vllm/third_party/.*'
+  # Insert new entries above the `suggestion` entry

From 3ee696a63dd0c2acee44809a3bedec33ea27dfa0 Mon Sep 17 00:00:00 2001
From: Keyun Tong <tongkeyun@gmail.com>
Date: Tue, 11 Feb 2025 20:25:58 -0800
Subject: [PATCH 26/43] [RFC][vllm-API] Support tokenizer registry for
 customized tokenizer in vLLM (#12518)

Signed-off-by: Keyun Tong <tongkeyun@gmail.com>
---
 benchmarks/benchmark_serving.py               |   5 +-
 tests/tokenization/test_tokenizer_registry.py | 123 +++++++++++++++
 vllm/config.py                                |   9 +-
 vllm/engine/arg_utils.py                      |   6 +-
 vllm/entrypoints/llm.py                       |  31 ++--
 vllm/entrypoints/openai/serving_engine.py     |   3 +-
 vllm/entrypoints/openai/serving_score.py      |   2 +-
 vllm/logits_process.py                        |   2 +-
 vllm/transformers_utils/tokenizer.py          |  18 ++-
 vllm/transformers_utils/tokenizer_base.py     | 146 ++++++++++++++++++
 vllm/transformers_utils/tokenizers/mistral.py |  39 +++--
 11 files changed, 343 insertions(+), 41 deletions(-)
 create mode 100644 tests/tokenization/test_tokenizer_registry.py
 create mode 100644 vllm/transformers_utils/tokenizer_base.py

diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 0c892384236bc..90eb052399bf0 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -1275,11 +1275,12 @@ if __name__ == "__main__":
         '--tokenizer-mode',
         type=str,
         default="auto",
-        choices=['auto', 'slow', 'mistral'],
+        choices=['auto', 'slow', 'mistral', 'custom'],
         help='The tokenizer mode.\n\n* "auto" will use the '
         'fast tokenizer if available.\n* "slow" will '
         'always use the slow tokenizer. \n* '
-        '"mistral" will always use the `mistral_common` tokenizer.')
+        '"mistral" will always use the `mistral_common` tokenizer. \n*'
+        '"custom" will use --tokenizer to select the preregistered tokenizer.')
 
     parser.add_argument("--served-model-name",
                         type=str,
diff --git a/tests/tokenization/test_tokenizer_registry.py b/tests/tokenization/test_tokenizer_registry.py
new file mode 100644
index 0000000000000..793d38f9c3666
--- /dev/null
+++ b/tests/tokenization/test_tokenizer_registry.py
@@ -0,0 +1,123 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.transformers_utils.tokenizer_base import (TokenizerBase,
+                                                    TokenizerRegistry)
+
+if TYPE_CHECKING:
+    from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
+
+
+class TestTokenizer(TokenizerBase):
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs) -> "TestTokenizer":
+        return TestTokenizer()
+
+    @property
+    def all_special_tokens_extended(self) -> List[str]:
+        raise NotImplementedError()
+
+    @property
+    def all_special_tokens(self) -> List[str]:
+        raise NotImplementedError()
+
+    @property
+    def all_special_ids(self) -> List[int]:
+        raise NotImplementedError()
+
+    @property
+    def bos_token_id(self) -> int:
+        return 0
+
+    @property
+    def eos_token_id(self) -> int:
+        return 1
+
+    @property
+    def sep_token(self) -> str:
+        raise NotImplementedError()
+
+    @property
+    def pad_token(self) -> str:
+        raise NotImplementedError()
+
+    @property
+    def is_fast(self) -> bool:
+        raise NotImplementedError()
+
+    @property
+    def vocab_size(self) -> int:
+        raise NotImplementedError()
+
+    @property
+    def max_token_id(self) -> int:
+        raise NotImplementedError()
+
+    def __call__(
+        self,
+        text: Union[str, List[str], List[int]],
+        text_pair: Optional[str] = None,
+        add_special_tokens: bool = False,
+        truncation: bool = False,
+        max_length: Optional[int] = None,
+    ):
+        raise NotImplementedError()
+
+    def get_vocab(self) -> Dict[str, int]:
+        raise NotImplementedError()
+
+    def get_added_vocab(self) -> Dict[str, int]:
+        raise NotImplementedError()
+
+    def encode_one(
+        self,
+        text: str,
+        truncation: bool = False,
+        max_length: Optional[int] = None,
+    ) -> List[int]:
+        raise NotImplementedError()
+
+    def encode(self,
+               text: str,
+               add_special_tokens: Optional[bool] = None) -> List[int]:
+        raise NotImplementedError()
+
+    def apply_chat_template(self,
+                            messages: List["ChatCompletionMessageParam"],
+                            tools: Optional[List[Dict[str, Any]]] = None,
+                            **kwargs) -> List[int]:
+        raise NotImplementedError()
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        raise NotImplementedError()
+
+    def decode(self,
+               ids: Union[List[int], int],
+               skip_special_tokens: bool = True) -> str:
+        raise NotImplementedError()
+
+    def convert_ids_to_tokens(
+        self,
+        ids: List[int],
+        skip_special_tokens: bool = True,
+    ) -> List[str]:
+        raise NotImplementedError()
+
+
+def test_customized_tokenizer():
+    TokenizerRegistry.register("test_tokenizer",
+                               "tests.tokenization.test_tokenizer_registry",
+                               "TestTokenizer")
+
+    tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer")
+    assert isinstance(tokenizer, TestTokenizer)
+    assert tokenizer.bos_token_id == 0
+    assert tokenizer.eos_token_id == 1
+
+    tokenizer = get_tokenizer("test_tokenizer", tokenizer_mode="custom")
+    assert isinstance(tokenizer, TestTokenizer)
+    assert tokenizer.bos_token_id == 0
+    assert tokenizer.eos_token_id == 1
diff --git a/vllm/config.py b/vllm/config.py
index 1d8c42dd276a6..1740871e7c107 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -102,8 +102,9 @@ class ModelConfig:
             it; otherwise, you must specify explicitly which task to use.
         tokenizer: Name or path of the huggingface tokenizer to use.
         tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
-            available, "slow" will always use the slow tokenizer, and
-            "mistral" will always use the tokenizer from `mistral_common`.
+            available, "slow" will always use the slow tokenizer,
+            "mistral" will always use the tokenizer from `mistral_common`, and
+            "custom" will use --tokenizer to select the preregistered tokenizer.
         trust_remote_code: Trust remote code (e.g., from HuggingFace) when
             downloading the model and tokenizer.
         allowed_local_media_path: Allowing API requests to read local images or
@@ -467,10 +468,10 @@ class ModelConfig:
 
     def _verify_tokenizer_mode(self) -> None:
         tokenizer_mode = self.tokenizer_mode.lower()
-        if tokenizer_mode not in ["auto", "slow", "mistral"]:
+        if tokenizer_mode not in ["auto", "slow", "mistral", "custom"]:
             raise ValueError(
                 f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
-                "either 'auto', 'slow' or 'mistral'.")
+                "either 'auto', 'slow', 'mistral' or 'custom'.")
         self.tokenizer_mode = tokenizer_mode
 
     def _get_preferred_task(
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 4232ad9204f44..83ee6b97f9387 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -284,11 +284,13 @@ class EngineArgs:
             '--tokenizer-mode',
             type=str,
             default=EngineArgs.tokenizer_mode,
-            choices=['auto', 'slow', 'mistral'],
+            choices=['auto', 'slow', 'mistral', 'custom'],
             help='The tokenizer mode.\n\n* "auto" will use the '
             'fast tokenizer if available.\n* "slow" will '
             'always use the slow tokenizer. \n* '
-            '"mistral" will always use the `mistral_common` tokenizer.')
+            '"mistral" will always use the `mistral_common` tokenizer. \n* '
+            '"custom" will use --tokenizer to select the '
+            'preregistered tokenizer.')
         parser.add_argument('--trust-remote-code',
                             action='store_true',
                             help='Trust remote code from huggingface.')
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index d071a0b3cfc5d..73593f0c6f0a5 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1051,9 +1051,9 @@ class LLM:
 
     def _cross_encoding_score(
         self,
-        tokenizer: Union[AnyTokenizer],
-        text_1: List[Union[str, TextPrompt, TokensPrompt]],
-        text_2: List[Union[str, TextPrompt, TokensPrompt]],
+        tokenizer: AnyTokenizer,
+        text_1: List[str],
+        text_2: List[str],
         truncate_prompt_tokens: Optional[int] = None,
         use_tqdm: bool = True,
         lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
@@ -1176,29 +1176,36 @@ class LLM:
         if isinstance(text_1, (str, dict)):
             # Convert a single prompt to a list.
             text_1 = [text_1]
-        text_1 = [ensure_str(t) for t in text_1]
+        input_text_1: List[str] = [ensure_str(t) for t in text_1]
 
         if isinstance(text_2, (str, dict)):
             # Convert a single prompt to a list.
             text_2 = [text_2]
-        text_2 = [ensure_str(t) for t in text_2]
+        input_text_2: List[str] = [ensure_str(t) for t in text_2]
 
-        if len(text_1) > 1 and len(text_1) != len(text_2):
+        if len(input_text_1) > 1 and len(input_text_1) != len(input_text_2):
             raise ValueError("Input lengths must be either 1:1, 1:N or N:N")
-        if len(text_1) == 0:
+        if len(input_text_1) == 0:
             raise ValueError("At least one text element must be given")
-        if len(text_2) == 0:
+        if len(input_text_2) == 0:
             raise ValueError("At least one text_pair element must be given")
 
         if self.llm_engine.model_config.is_cross_encoder:
-            return self._cross_encoding_score(tokenizer, text_1, text_2,
+            return self._cross_encoding_score(tokenizer, input_text_1,
+                                              input_text_2,
                                               truncate_prompt_tokens, use_tqdm,
                                               lora_request,
                                               prompt_adapter_request)
         else:
-            return self._embedding_score(tokenizer, text_1, text_2,
-                                         truncate_prompt_tokens, use_tqdm,
-                                         lora_request, prompt_adapter_request)
+
+            return self._embedding_score(
+                tokenizer,
+                input_text_1,  # type: ignore[arg-type]
+                input_text_2,  # type: ignore[arg-type]
+                truncate_prompt_tokens,
+                use_tqdm,
+                lora_request,
+                prompt_adapter_request)
 
     def start_profile(self) -> None:
         self.llm_engine.start_profile()
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 8d39fdcb74833..9efb5e6fa3987 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -400,8 +400,7 @@ class OpenAIServing:
         _chat_template_kwargs.update(chat_template_kwargs or {})
 
         request_prompt: Union[str, List[int]]
-        is_mistral_tokenizer = isinstance(tokenizer, MistralTokenizer)
-        if is_mistral_tokenizer:
+        if isinstance(tokenizer, MistralTokenizer):
             request_prompt = apply_mistral_chat_template(
                 tokenizer,
                 messages=messages,
diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py
index 832aa8516cc35..c7597808f7fe3 100644
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -121,7 +121,7 @@ class OpenAIServingScores(OpenAIServing):
 
                 tokenize_async = make_async(tokenizer.__call__,
                                             executor=self._tokenizer_executor)
-                prompt_inputs = await tokenize_async(text=q,
+                prompt_inputs = await tokenize_async(q,
                                                      text_pair=t,
                                                      **tokenization_kwargs)
 
diff --git a/vllm/logits_process.py b/vllm/logits_process.py
index d02072e8f8189..a810be7bc7a85 100644
--- a/vllm/logits_process.py
+++ b/vllm/logits_process.py
@@ -31,7 +31,7 @@ def get_bad_words_logits_processors(
 
             if isinstance(tokenizer, MistralTokenizer):
                 # Mistral tokenizers should not add special tokens
-                prompt_token_ids = tokenizer.encode(prompt=prompt)
+                prompt_token_ids = tokenizer.encode(text=prompt)
             else:
                 prompt_token_ids = tokenizer.encode(text=prompt,
                                                     add_special_tokens=False)
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 520870b563c9e..0c0f68ac123e2 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -14,6 +14,8 @@ from transformers import (AutoTokenizer, PreTrainedTokenizer,
 from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
+from vllm.transformers_utils.tokenizer_base import (TokenizerBase,
+                                                    TokenizerRegistry)
 from vllm.transformers_utils.tokenizers import MistralTokenizer
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.utils import make_async
@@ -21,7 +23,7 @@ from vllm.utils import make_async
 logger = init_logger(__name__)
 
 AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast,
-                     MistralTokenizer]
+                     TokenizerBase]
 
 
 def decode_tokens(
@@ -47,11 +49,7 @@ def encode_tokens(
     Backend-agnostic equivalent of HF's
     :code:`tokenizer.encode(text, add_special_tokens=...)`.
     """
-    if isinstance(tokenizer, MistralTokenizer):
-        return tokenizer.tokenizer.encode(text,
-                                          bos=add_special_tokens,
-                                          eos=add_special_tokens)
-    elif add_special_tokens is not None:
+    if add_special_tokens is not None:
         return tokenizer.encode(text, add_special_tokens=add_special_tokens)
     return tokenizer.encode(text)
 
@@ -183,9 +181,17 @@ def get_tokenizer(
             'encoding and decoding.',
             FutureWarning,
             stacklevel=2)
+
+    tokenizer: AnyTokenizer
     if tokenizer_mode == "mistral":
         tokenizer = MistralTokenizer.from_pretrained(str(tokenizer_name),
                                                      revision=revision)
+    elif tokenizer_mode == "custom":
+        tokenizer = TokenizerRegistry.get_tokenizer(str(tokenizer_name),
+                                                    *args,
+                                                    revision=revision,
+                                                    download_dir=download_dir,
+                                                    **kwargs)
     else:
         try:
             tokenizer = AutoTokenizer.from_pretrained(
diff --git a/vllm/transformers_utils/tokenizer_base.py b/vllm/transformers_utils/tokenizer_base.py
new file mode 100644
index 0000000000000..bb5ddaf88b219
--- /dev/null
+++ b/vllm/transformers_utils/tokenizer_base.py
@@ -0,0 +1,146 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import importlib
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+
+if TYPE_CHECKING:
+    from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
+
+
+class TokenizerBase(ABC):
+
+    @property
+    @abstractmethod
+    def all_special_tokens_extended(self) -> List[str]:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def all_special_tokens(self) -> List[str]:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def all_special_ids(self) -> List[int]:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def bos_token_id(self) -> int:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def eos_token_id(self) -> int:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def sep_token(self) -> str:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def pad_token(self) -> str:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def is_fast(self) -> bool:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def vocab_size(self) -> int:
+        raise NotImplementedError()
+
+    @property
+    @abstractmethod
+    def max_token_id(self) -> int:
+        raise NotImplementedError()
+
+    def __len__(self) -> int:
+        return self.vocab_size
+
+    @abstractmethod
+    def __call__(
+        self,
+        text: Union[str, List[str], List[int]],
+        text_pair: Optional[str] = None,
+        add_special_tokens: bool = False,
+        truncation: bool = False,
+        max_length: Optional[int] = None,
+    ):
+        raise NotImplementedError()
+
+    @abstractmethod
+    def get_vocab(self) -> Dict[str, int]:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def get_added_vocab(self) -> Dict[str, int]:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def encode_one(
+        self,
+        text: str,
+        truncation: bool = False,
+        max_length: Optional[int] = None,
+    ) -> List[int]:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def encode(self,
+               text: str,
+               add_special_tokens: Optional[bool] = None) -> List[int]:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def apply_chat_template(self,
+                            messages: List["ChatCompletionMessageParam"],
+                            tools: Optional[List[Dict[str, Any]]] = None,
+                            **kwargs) -> List[int]:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def decode(self,
+               ids: Union[List[int], int],
+               skip_special_tokens: bool = True) -> str:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def convert_ids_to_tokens(
+        self,
+        ids: List[int],
+        skip_special_tokens: bool = True,
+    ) -> List[str]:
+        raise NotImplementedError()
+
+
+class TokenizerRegistry:
+    # Tokenizer name -> (tokenizer module, tokenizer class)
+    REGISTRY: Dict[str, Tuple[str, str]] = {}
+
+    @staticmethod
+    def register(name: str, module: str, class_name: str) -> None:
+        TokenizerRegistry.REGISTRY[name] = (module, class_name)
+
+    @staticmethod
+    def get_tokenizer(
+        tokenizer_name: str,
+        *args,
+        **kwargs,
+    ) -> TokenizerBase:
+        tokenizer_cls = TokenizerRegistry.REGISTRY.get(tokenizer_name)
+        if tokenizer_cls is None:
+            raise ValueError(f"Tokenizer {tokenizer_name} not found.")
+
+        tokenizer_module = importlib.import_module(tokenizer_cls[0])
+        class_ = getattr(tokenizer_module, tokenizer_cls[1])
+        return class_.from_pretrained(*args, **kwargs)
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index f08923e7401f3..59131a9d7bfdd 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -10,6 +10,7 @@ import huggingface_hub
 from huggingface_hub import HfApi, hf_hub_download
 
 from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer_base import TokenizerBase
 from vllm.utils import is_list_of
 
 if TYPE_CHECKING:
@@ -140,7 +141,7 @@ def make_mistral_chat_completion_request(
                                  tools=tools)  # type: ignore[type-var]
 
 
-class MistralTokenizer:
+class MistralTokenizer(TokenizerBase):
 
     def __init__(self, tokenizer: "PublicMistralTokenizer") -> None:
         self.mistral = tokenizer
@@ -251,6 +252,14 @@ class MistralTokenizer:
     def eos_token_id(self) -> int:
         return self.tokenizer.eos_id
 
+    @property
+    def sep_token(self) -> str:
+        raise NotImplementedError()
+
+    @property
+    def pad_token(self) -> str:
+        raise NotImplementedError()
+
     @property
     def is_fast(self) -> bool:
         return True
@@ -268,25 +277,26 @@ class MistralTokenizer:
 
     def __call__(
         self,
-        prompt: Union[str, List[str], List[int]],
+        text: Union[str, List[str], List[int]],
+        text_pair: Optional[str] = None,
         add_special_tokens: bool = False,
         truncation: bool = False,
         max_length: Optional[int] = None,
     ):
         input_ids: Union[List[int], List[List[int]]]
         # For List[str], original prompt text
-        if is_list_of(prompt, str):
+        if is_list_of(text, str):
             input_ids_: List[List[int]] = []
-            for p in prompt:
+            for p in text:
                 each_input_ids = self.encode_one(p, truncation, max_length)
                 input_ids_.append(each_input_ids)
             input_ids = input_ids_
         # For List[int], apply chat template output, already tokens.
-        elif is_list_of(prompt, int):
-            input_ids = prompt
+        elif is_list_of(text, int):
+            input_ids = text
         # For str, single prompt text
         else:
-            input_ids = self.encode_one(prompt, truncation, max_length)
+            input_ids = self.encode_one(text, truncation, max_length)
         return Encoding(input_ids=input_ids)
 
     def get_vocab(self) -> Dict[str, int]:
@@ -300,22 +310,29 @@ class MistralTokenizer:
 
     def encode_one(
         self,
-        prompt: str,
+        text: str,
         truncation: bool = False,
         max_length: Optional[int] = None,
     ) -> List[int]:
         # Mistral Tokenizers should not add special tokens
-        input_ids = self.encode(prompt)
+        input_ids = self.encode(text)
 
         if truncation:
             input_ids = input_ids[:max_length]
         return input_ids
 
-    def encode(self, prompt: str) -> List[int]:
+    def encode(self,
+               text: str,
+               add_special_tokens: Optional[bool] = None) -> List[int]:
         # `encode` should only be used for prompt completion
         # it should never be used for chat_completion.
         # For chat completion use `apply_chat_template`
-        return self.tokenizer.encode(prompt, bos=True, eos=False)
+        if add_special_tokens is not None:
+            return self.tokenizer.encode(text,
+                                         bos=add_special_tokens,
+                                         eos=add_special_tokens)
+        else:
+            return self.tokenizer.encode(text, bos=True, eos=False)
 
     def apply_chat_template(self,
                             messages: List["ChatCompletionMessageParam"],

From 974dfd497149e871e59e35b677a85cca66ec3bae Mon Sep 17 00:00:00 2001
From: Christian Pinto <chrpinto@gmail.com>
Date: Wed, 12 Feb 2025 04:34:30 +0000
Subject: [PATCH 27/43] [Model] IBM/NASA Prithvi Geospatial model  (#12830)

---
 .../prithvi_geospatial_mae.py                 | 530 ++++++++++++++++++
 tests/models/registry.py                      |   4 +
 vllm/attention/backends/placeholder_attn.py   |  11 +-
 vllm/inputs/preprocess.py                     |  22 +-
 .../models/prithvi_geospatial_mae.py          | 238 ++++++++
 vllm/model_executor/models/registry.py        |   4 +
 vllm/worker/pooling_model_runner.py           |  11 +-
 7 files changed, 811 insertions(+), 9 deletions(-)
 create mode 100644 examples/offline_inference/prithvi_geospatial_mae.py
 create mode 100644 vllm/model_executor/models/prithvi_geospatial_mae.py

diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py
new file mode 100644
index 0000000000000..298f08019004d
--- /dev/null
+++ b/examples/offline_inference/prithvi_geospatial_mae.py
@@ -0,0 +1,530 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+This is a demo script showing how to use the
+PrithviGeospatialMAE model with vLLM
+This script is based on: https://huggingface.co/ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11/blob/main/inference.py # noqa
+
+Target model weights: https://huggingface.co/ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11/resolve/main/Prithvi-EO-V2-300M-TL-Sen1Floods11.pt # noqa
+
+The requirements for running this script are:
+- Installing [terratorch, albumentations, rasterio] in your python environment
+- downloading the model weights in a 'model' folder local to the script
+  (temporary measure until the proper config.json file is uploaded to HF)
+- download an input example image (India_900498_S2Hand.tif) and place it in
+  the same folder with the script (or specify with the --data_file argument)
+
+Run the example:
+python prithvi_geospatial_mae.py
+
+""" # noqa: E501
+import argparse
+import datetime
+import os
+import re
+from typing import List, Union
+
+import albumentations
+import numpy as np
+import rasterio
+import torch
+from einops import rearrange
+from terratorch.datamodules import Sen1Floods11NonGeoDataModule
+
+from vllm import LLM
+
+NO_DATA = -9999
+NO_DATA_FLOAT = 0.0001
+OFFSET = 0
+PERCENTILE = 99
+
+model_config = """{
+  "architectures": ["PrithviGeoSpatialMAE"],
+  "num_classes": 0,
+  "pretrained_cfg": {
+    "task_args": {
+      "task": "SemanticSegmentationTask",
+      "model_factory": "EncoderDecoderFactory",
+      "loss": "ce",
+      "ignore_index": -1,
+      "lr": 0.001,
+      "freeze_backbone": false,
+      "freeze_decoder": false,
+      "plot_on_val": 10,
+      "optimizer": "AdamW",
+      "scheduler": "CosineAnnealingLR"
+    },
+    "model_args": {
+      "backbone_pretrained": false,
+      "backbone": "prithvi_eo_v2_300_tl",
+      "decoder": "UperNetDecoder",
+      "decoder_channels": 256,
+      "decoder_scale_modules": true,
+      "num_classes": 2,
+      "rescale": true,
+      "backbone_bands": [
+        "BLUE",
+        "GREEN",
+        "RED",
+        "NIR_NARROW",
+        "SWIR_1",
+        "SWIR_2"
+      ],
+      "head_dropout": 0.1,
+      "necks": [
+        {
+          "name": "SelectIndices",
+          "indices": [
+            5,
+            11,
+            17,
+            23
+          ]
+        },
+        {
+          "name": "ReshapeTokensToImage"
+        }
+      ]
+    },
+    "optimizer_params" : {
+      "lr": 5.0e-05,
+      "betas": [0.9, 0.999],
+      "eps": [1.0e-08],
+      "weight_decay": 0.05,
+      "amsgrad": false,
+      "maximize": false,
+      "capturable": false,
+      "differentiable": false
+    },
+    "scheduler_params" : {
+        "T_max": 50,
+        "eta_min": 0,
+        "last_epoch": -1,
+        "verbose": "deprecated"
+    }
+  },
+
+
+  "torch_dtype": "float32"
+}
+"""
+
+# Temporarily creating the "config.json" for the model.
+# This is going to disappear once the correct config.json is available on HF
+with open(os.path.join(os.path.dirname(__file__), "./model/config.json"),
+          'w') as config_file:
+    config_file.write(model_config)
+
+datamodule_config = {
+    'bands': ['BLUE', 'GREEN', 'RED', 'NIR_NARROW', 'SWIR_1', 'SWIR_2'],
+    'batch_size':
+    16,
+    'constant_scale':
+    0.0001,
+    'data_root':
+    '/dccstor/geofm-finetuning/datasets/sen1floods11',
+    'drop_last':
+    True,
+    'no_data_replace':
+    0.0,
+    'no_label_replace':
+    -1,
+    'num_workers':
+    8,
+    'test_transform': [
+        albumentations.Resize(always_apply=False,
+                              height=448,
+                              interpolation=1,
+                              p=1,
+                              width=448),
+        albumentations.pytorch.ToTensorV2(transpose_mask=False,
+                                          always_apply=True,
+                                          p=1.0)
+    ],
+}
+
+
+class PrithviMAE:
+
+    def __init__(self):
+        print("Initializing PrithviMAE model")
+        self.model = LLM(model=os.path.join(os.path.dirname(__file__),
+                                            "./model"),
+                         skip_tokenizer_init=True,
+                         dtype="float32")
+
+    def run(self, input_data, location_coords):
+        print("################ Running inference on vLLM ##############")
+        # merge the inputs into one data structure
+        mm_data = {
+            "pixel_values":
+            torch.empty(0) if input_data is None else input_data,
+            "location_coords":
+            torch.empty(0) if location_coords is None else location_coords
+        }
+
+        prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data}
+
+        outputs = self.model.encode(prompt, use_tqdm=False)
+        print(
+            "################ Inference done (it took seconds)  ##############"
+        )
+
+        return outputs[0].outputs.data
+
+
+def generate_datamodule():
+    datamodule = Sen1Floods11NonGeoDataModule(
+        data_root=datamodule_config['data_root'],
+        batch_size=datamodule_config["batch_size"],
+        num_workers=datamodule_config["num_workers"],
+        bands=datamodule_config["bands"],
+        drop_last=datamodule_config["drop_last"],
+        test_transform=datamodule_config["test_transform"
+                                         ""])
+
+    return datamodule
+
+
+def process_channel_group(orig_img, channels):
+    """
+    Args:
+        orig_img: torch.Tensor representing original image (reference)
+                  with shape = (bands, H, W).
+        channels: list of indices representing RGB channels.
+
+    Returns:
+        torch.Tensor with shape (num_channels, height, width) for original image
+    """
+
+    orig_img = orig_img[channels, ...]
+    valid_mask = torch.ones_like(orig_img, dtype=torch.bool)
+    valid_mask[orig_img == NO_DATA_FLOAT] = False
+
+    # Rescale (enhancing contrast)
+    max_value = max(3000, np.percentile(orig_img[valid_mask], PERCENTILE))
+    min_value = OFFSET
+
+    orig_img = torch.clamp((orig_img - min_value) / (max_value - min_value), 0,
+                           1)
+
+    # No data as zeros
+    orig_img[~valid_mask] = 0
+
+    return orig_img
+
+
+def read_geotiff(file_path: str):
+    """Read all bands from *file_path* and return image + meta info.
+
+    Args:
+        file_path: path to image file.
+
+    Returns:
+        np.ndarray with shape (bands, height, width)
+        meta info dict
+    """
+
+    with rasterio.open(file_path) as src:
+        img = src.read()
+        meta = src.meta
+        try:
+            coords = src.lnglat()
+        except Exception:
+            # Cannot read coords
+            coords = None
+
+    return img, meta, coords
+
+
+def save_geotiff(image, output_path: str, meta: dict):
+    """Save multi-band image in Geotiff file.
+
+    Args:
+        image: np.ndarray with shape (bands, height, width)
+        output_path: path where to save the image
+        meta: dict with meta info.
+    """
+
+    with rasterio.open(output_path, "w", **meta) as dest:
+        for i in range(image.shape[0]):
+            dest.write(image[i, :, :], i + 1)
+
+    return
+
+
+def _convert_np_uint8(float_image: torch.Tensor):
+    image = float_image.numpy() * 255.0
+    image = image.astype(dtype=np.uint8)
+
+    return image
+
+
+def load_example(
+    file_paths: List[str],
+    mean: List[float] = None,
+    std: List[float] = None,
+    indices: Union[list[int], None] = None,
+):
+    """Build an input example by loading images in *file_paths*.
+
+    Args:
+        file_paths: list of file paths .
+        mean: list containing mean values for each band in the images
+              in *file_paths*.
+        std: list containing std values for each band in the images
+             in *file_paths*.
+
+    Returns:
+        np.array containing created example
+        list of meta info for each image in *file_paths*
+    """
+
+    imgs = []
+    metas = []
+    temporal_coords = []
+    location_coords = []
+
+    for file in file_paths:
+        img, meta, coords = read_geotiff(file)
+
+        # Rescaling (don't normalize on nodata)
+        img = np.moveaxis(img, 0, -1)  # channels last for rescaling
+        if indices is not None:
+            img = img[..., indices]
+        if mean is not None and std is not None:
+            img = np.where(img == NO_DATA, NO_DATA_FLOAT, (img - mean) / std)
+
+        imgs.append(img)
+        metas.append(meta)
+        if coords is not None:
+            location_coords.append(coords)
+
+        try:
+            match = re.search(r'(\d{7,8}T\d{6})', file)
+            if match:
+                year = int(match.group(1)[:4])
+                julian_day = match.group(1).split('T')[0][4:]
+                if len(julian_day) == 3:
+                    julian_day = int(julian_day)
+                else:
+                    julian_day = datetime.datetime.strptime(
+                        julian_day, '%m%d').timetuple().tm_yday
+                temporal_coords.append([year, julian_day])
+        except Exception as e:
+            print(f'Could not extract timestamp for {file} ({e})')
+
+    imgs = np.stack(imgs, axis=0)  # num_frames, H, W, C
+    imgs = np.moveaxis(imgs, -1, 0).astype("float32")
+    imgs = np.expand_dims(imgs, axis=0)  # add batch di
+
+    return imgs, temporal_coords, location_coords, metas
+
+
+def run_model(input_data,
+              temporal_coords,
+              location_coords,
+              model,
+              datamodule,
+              img_size,
+              lightning_model=None):
+    # Reflect pad if not divisible by img_size
+    original_h, original_w = input_data.shape[-2:]
+    pad_h = (img_size - (original_h % img_size)) % img_size
+    pad_w = (img_size - (original_w % img_size)) % img_size
+    input_data = np.pad(input_data,
+                        ((0, 0), (0, 0), (0, 0), (0, pad_h), (0, pad_w)),
+                        mode="reflect")
+
+    # Build sliding window
+    batch_size = 1
+    batch = torch.tensor(input_data, device="cpu")
+    windows = (batch.unfold(3, img_size,
+                            img_size).unfold(4, img_size, img_size))
+    h1, w1 = windows.shape[3:5]
+    windows = rearrange(windows,
+                        "b c t h1 w1 h w -> (b h1 w1) c t h w",
+                        h=img_size,
+                        w=img_size)
+
+    # Split into batches if number of windows > batch_size
+    num_batches = windows.shape[0] // batch_size if windows.shape[
+        0] > batch_size else 1
+    windows = torch.tensor_split(windows, num_batches, dim=0)
+
+    if torch.cuda.is_available():
+        device = torch.device('cuda')
+    else:
+        device = torch.device('cpu')
+
+    if temporal_coords:
+        temporal_coords = torch.tensor(temporal_coords,
+                                       device=device).unsqueeze(0)
+    else:
+        temporal_coords = None
+    if location_coords:
+        location_coords = torch.tensor(location_coords[0],
+                                       device=device).unsqueeze(0)
+    else:
+        location_coords = None
+
+    # Run model
+    pred_imgs = []
+    for x in windows:
+        # Apply standardization
+        x = datamodule.test_transform(
+            image=x.squeeze().numpy().transpose(1, 2, 0))
+        x = datamodule.aug(x)['image']
+
+        with torch.no_grad():
+            x = x.to(device)
+            pred = model.run(x, location_coords=location_coords)
+            if lightning_model:
+                pred_lightning = lightning_model(
+                    x,
+                    temporal_coords=temporal_coords,
+                    location_coords=location_coords)
+                pred_lightning = pred_lightning.output.detach().cpu()
+                if not torch.equal(pred, pred_lightning):
+                    print("Inference output is not equal")
+        y_hat = pred.argmax(dim=1)
+
+        y_hat = torch.nn.functional.interpolate(y_hat.unsqueeze(1).float(),
+                                                size=img_size,
+                                                mode="nearest")
+
+        pred_imgs.append(y_hat)
+
+    pred_imgs = torch.concat(pred_imgs, dim=0)
+
+    # Build images from patches
+    pred_imgs = rearrange(
+        pred_imgs,
+        "(b h1 w1) c h w -> b c (h1 h) (w1 w)",
+        h=img_size,
+        w=img_size,
+        b=1,
+        c=1,
+        h1=h1,
+        w1=w1,
+    )
+
+    # Cut padded area back to original size
+    pred_imgs = pred_imgs[..., :original_h, :original_w]
+
+    # Squeeze (batch size 1)
+    pred_imgs = pred_imgs[0]
+
+    return pred_imgs
+
+
+def main(
+    data_file: str,
+    output_dir: str,
+    rgb_outputs: bool,
+    input_indices: list[int] = None,
+):
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Load model ---------------------------------------------------------------
+
+    model_obj = PrithviMAE()
+    datamodule = generate_datamodule()
+    img_size = 256  # Size of Sen1Floods11
+
+    # Loading data -------------------------------------------------------------
+
+    input_data, temporal_coords, location_coords, meta_data = load_example(
+        file_paths=[data_file],
+        indices=input_indices,
+    )
+
+    meta_data = meta_data[0]  # only one image
+
+    if input_data.mean() > 1:
+        input_data = input_data / 10000  # Convert to range 0-1
+
+    # Running model ------------------------------------------------------------
+
+    channels = [
+        datamodule_config['bands'].index(b) for b in ["RED", "GREEN", "BLUE"]
+    ]  # BGR -> RGB
+
+    pred = run_model(input_data, temporal_coords, location_coords, model_obj,
+                     datamodule, img_size)
+
+    # Save pred
+    meta_data.update(count=1, dtype="uint8", compress="lzw", nodata=0)
+    pred_file = os.path.join(
+        output_dir,
+        f"pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff")
+    save_geotiff(_convert_np_uint8(pred), pred_file, meta_data)
+
+    # Save image + pred
+    meta_data.update(count=3, dtype="uint8", compress="lzw", nodata=0)
+
+    if input_data.mean() < 1:
+        input_data = input_data * 10000  # Scale to 0-10000
+
+    rgb_orig = process_channel_group(
+        orig_img=torch.Tensor(input_data[0, :, 0, ...]),
+        channels=channels,
+    )
+
+    pred[pred == 0.] = np.nan
+    img_pred = rgb_orig * 0.7 + pred * 0.3
+    img_pred[img_pred.isnan()] = rgb_orig[img_pred.isnan()]
+
+    img_pred_file = os.path.join(
+        output_dir,
+        f"rgb_pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff")
+    save_geotiff(
+        image=_convert_np_uint8(img_pred),
+        output_path=img_pred_file,
+        meta=meta_data,
+    )
+
+    # Save image rgb
+    if rgb_outputs:
+        rgb_file = os.path.join(
+            output_dir, "original_rgb_"
+            f"{os.path.splitext(os.path.basename(data_file))[0]}.tiff")
+        save_geotiff(
+            image=_convert_np_uint8(rgb_orig),
+            output_path=rgb_file,
+            meta=meta_data,
+        )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser("MAE run inference", add_help=False)
+
+    parser.add_argument(
+        "--data_file",
+        type=str,
+        default="./India_900498_S2Hand.tif",
+        help="Path to the file.",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="output",
+        help="Path to the directory where to save outputs.",
+    )
+    parser.add_argument(
+        "--input_indices",
+        default=[1, 2, 3, 8, 11, 12],
+        type=int,
+        nargs="+",
+        help=
+        "0-based indices of the six Prithvi channels to be selected from the  "
+        "input. By default selects [1,2,3,8,11,12] for S2L1C data.",
+    )
+    parser.add_argument(
+        "--rgb_outputs",
+        action="store_true",
+        help="If present, output files will only contain RGB channels. "
+        "Otherwise, all bands will be saved.",
+    )
+    args = parser.parse_args()
+
+    main(**vars(args))
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 66b7d3c2e77b5..7b1db55494fe4 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -214,6 +214,10 @@ _EMBEDDING_EXAMPLE_MODELS = {
     "Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full",
                                          trust_remote_code=True),
     "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"), # noqa: E501
+    # The model on Huggingface is currently being updated,
+    # hence I temporarily mark it as not available online
+    "PrithviGeoSpatialMAE": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",  # noqa: E501
+                                            is_available_online=False),
 }
 
 _CROSS_ENCODER_EXAMPLE_MODELS = {
diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py
index f363ba0c1e30c..f1def25c89cff 100644
--- a/vllm/attention/backends/placeholder_attn.py
+++ b/vllm/attention/backends/placeholder_attn.py
@@ -320,9 +320,14 @@ class PlaceholderAttentionMetadataBuilder(
                                  -1 if cuda graph is not used.
             batch_size: The maybe padded batch size.
         """
-        for inter_data in self.input_builder.inter_data_list:
-            self._add_seq_group(inter_data,
-                                self.input_builder.chunked_prefill_enabled)
+
+        # Some input builders such as ModelInputForCPUBuilder do not have the
+        # "inter_data_list" attribute.
+        # Let's check inter_data_list exists before we reference it.
+        if hasattr(self.input_builder, "inter_data_list"):
+            for inter_data in self.input_builder.inter_data_list:
+                self._add_seq_group(inter_data,
+                                    self.input_builder.chunked_prefill_enabled)
 
         device = self.runner.device
         use_captured_graph = cuda_graph_pad_size != -1
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 53f89996f0fe9..656f2f2b766e0 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -254,8 +254,14 @@ class InputPreprocessor:
         Apply the model's multi-modal processor to a multi-modal prompt,
         returning the corresponding token IDs and metadata.
         """
-        tokenizer_group = self.get_tokenizer_group()
-        tokenizer = tokenizer_group.get_lora_tokenizer(lora_request)
+        # At the moment on model (PrithviGeoSpatialMAE) requires to be
+        # initialized without a tokenizer while using also multi-modal
+        # input.
+        if not self.tokenizer:
+            tokenizer = None
+        else:
+            tokenizer_group = self.get_tokenizer_group()
+            tokenizer = tokenizer_group.get_lora_tokenizer(lora_request)
 
         mm_processor = self.mm_registry.create_processor(
             self.model_config, tokenizer)
@@ -273,9 +279,15 @@ class InputPreprocessor:
         lora_request: Optional[LoRARequest],
     ) -> MultiModalInputs:
         """Async version of :meth:`_process_multimodal`."""
-        tokenizer_group = self.get_tokenizer_group()
-        tokenizer = await tokenizer_group.get_lora_tokenizer_async(lora_request
-                                                                   )
+        # At the moment on model (PrithviGeoSpatialMAE) requires to be
+        # initialized without a tokenizer while using also multi-modal
+        # input.
+        if not self.tokenizer:
+            tokenizer = None
+        else:
+            tokenizer_group = self.get_tokenizer_group()
+            tokenizer = await tokenizer_group.get_lora_tokenizer_async(
+                lora_request)
 
         mm_processor = self.mm_registry.create_processor(
             self.model_config, tokenizer)
diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py
new file mode 100644
index 0000000000000..9383cbae11bce
--- /dev/null
+++ b/vllm/model_executor/models/prithvi_geospatial_mae.py
@@ -0,0 +1,238 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright 2025 The vLLM team.
+# Copyright 2025 IBM.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only IBM/NASA Prithvi Geospatial model."""
+from typing import Iterable, List, Mapping, Optional, Set, Tuple, Union
+
+import torch
+import torch.nn as nn
+from transformers import BatchFeature
+
+from vllm.attention import AttentionMetadata
+from vllm.config import VllmConfig
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.interfaces import (IsAttentionFree,
+                                                   SupportsMultiModal)
+from vllm.model_executor.models.utils import AutoWeightsLoader
+from vllm.model_executor.pooling_metadata import PoolingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
+                                    MultiModalInputs, MultiModalKwargs)
+from vllm.multimodal.parse import MultiModalDataItems
+from vllm.multimodal.processing import (BaseMultiModalProcessor,
+                                        BaseProcessingInfo, PromptReplacement)
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.sequence import (IntermediateTensors, PoolerOutput,
+                           PoolingSequenceGroupOutput)
+
+
+class PrithviGeoSpatialMAEProcessingInfo(BaseProcessingInfo):
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
+
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        pass
+
+
+class PrithviGeoSpatialMAEInputBuilder(
+        BaseDummyInputsBuilder[PrithviGeoSpatialMAEProcessingInfo]):
+
+    def get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        return ProcessorInputs(
+            prompt_text="",
+            # This model input is fixed and is in the form of a torch Tensor.
+            # The size of pixel_values might change in the cases where we resize
+            # the input but never exceeds the dimensions below.
+            mm_data={
+                "pixel_values": torch.full((1, 6, 512, 512), 1.0),
+                "location_coords": torch.full((1, 2), 1.0)
+            })
+
+
+class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            location_coords=MultiModalFieldConfig.batched("image"),
+        )
+
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        pass
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        pass
+
+    def apply(
+        self,
+        prompt: Union[str, list[int]],
+        mm_data: MultiModalDataDict,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> MultiModalInputs:
+        mm_kwargs = {}
+
+        for k, v in mm_data.items():
+            mm_kwargs[k] = v
+
+        return MultiModalInputs(
+            type="multimodal",
+            prompt=prompt,
+            prompt_token_ids=[1],
+            mm_kwargs=MultiModalKwargs(mm_kwargs),
+            mm_placeholders={},
+        )
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    PrithviGeoSpatialMAEMultiModalProcessor,
+    info=PrithviGeoSpatialMAEProcessingInfo,
+    dummy_inputs=PrithviGeoSpatialMAEInputBuilder)
+class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, SupportsMultiModal):
+    """ Prithvi Masked Autoencoder"""
+
+    def _instantiate_model(self, config: dict) -> nn.Module | None:
+
+        # We might be able/need to support different tasks with this same model
+        if config["task_args"]["task"] == "SemanticSegmentationTask":
+            from terratorch.cli_tools import SemanticSegmentationTask
+            task = SemanticSegmentationTask(
+                config["model_args"],
+                config["task_args"]["model_factory"],
+                loss=config["task_args"]["loss"],
+                lr=config["task_args"]["lr"],
+                ignore_index=config["task_args"]["ignore_index"],
+                optimizer=config["task_args"]["optimizer"],
+                optimizer_hparams=config["optimizer_params"],
+                scheduler=config["task_args"]["scheduler"],
+                scheduler_hparams=config["scheduler_params"],
+                plot_on_val=config["task_args"]["plot_on_val"],
+                freeze_decoder=config["task_args"]["freeze_decoder"],
+                freeze_backbone=config["task_args"]["freeze_backbone"])
+
+            return task.model
+        else:
+            return None
+
+    def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        # the actual model is dynamically instantiated using terratorch
+        # allowing us to perform changes to the model architecture
+        # at startup time (e.g., change the model decoder class.)
+        self.model = self._instantiate_model(
+            vllm_config.model_config.hf_config.to_dict()["pretrained_cfg"])
+        if self.model is None:
+            raise ValueError(
+                "Unsupported task."
+                "Only SemanticSegmentationTask is supported for now"
+                "by PrithviGeospatialMAE.")
+
+    def _parse_and_validate_multimodal_data(
+            self, **kwargs) -> Tuple[torch.Tensor, torch.Tensor | None]:
+
+        pixel_values = kwargs.pop("pixel_values", None)
+        if not isinstance(pixel_values, torch.Tensor):
+            raise ValueError(f"Incorrect type of pixel_values. "
+                             f"Got type: {type(pixel_values)}")
+        pixel_values = torch.unbind(pixel_values, dim=0)[0]
+
+        location_coords = kwargs.pop("location_coords", None)
+        if not isinstance(location_coords, torch.Tensor):
+            raise ValueError(f"Incorrect type of location_coords. "
+                             f"Got type: {type(location_coords)}")
+        location_coords = torch.unbind(location_coords, dim=0)[0]
+        if location_coords.shape == torch.Size([0]):
+            location_coords = None
+
+        return pixel_values, location_coords
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor],
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs: object,
+    ):
+
+        pixel_values, location_coords = (
+            self._parse_and_validate_multimodal_data(**kwargs))
+        model_output = self.model(pixel_values,
+                                  location_coords=location_coords)
+
+        return model_output.output
+
+    def pooler(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> Optional[PoolerOutput]:
+        return PoolerOutput([PoolingSequenceGroupOutput(hidden_states)])
+
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+        params_list = []
+        model_buffers = dict(self.named_buffers())
+        loaded_buffers = []
+        for key, value in weights:
+            if key == "state_dict":
+                weights_to_parse = value
+                for name, weight in weights_to_parse.items():
+                    if "pos_embed" in name:
+                        continue
+
+                    if "_timm_module." in name:
+                        name = name.replace("_timm_module.", "")
+
+                    # this model requires a couple of buffers to be loaded
+                    # that are not loadable with the AutoWeightsLoader
+                    if name in model_buffers:
+                        if "_timm_module." in name:
+                            name = name.replace("_timm_module.", "")
+                        buffer = model_buffers[name]
+                        weight_loader = getattr(buffer, "weight_loader",
+                                                default_weight_loader)
+                        weight_loader(buffer, weight)
+                        loaded_buffers.append(name)
+                    else:
+                        params_list.append((name, weight))
+                break
+
+        # Load the remaining model parameters
+        loader = AutoWeightsLoader(self)
+        autoloaded_weights = loader.load_weights(params_list)
+
+        return autoloaded_weights.union(set(loaded_buffers))
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index c2d0fae7056c7..ebf6a88f21b8e 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -137,6 +137,10 @@ _EMBEDDING_MODELS = {
     "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
     # [Auto-converted (see adapters.py)]
     "Qwen2ForSequenceClassification": ("qwen2", "Qwen2ForCausalLM"),
+    # Technically PrithviGeoSpatialMAE is a model that works on images, both in
+    # input and output. I am adding it here because it piggy-backs on embedding
+    # models for the time being.
+    "PrithviGeoSpatialMAE": ("prithvi_geospatial_mae", "PrithviGeoSpatialMAE"),
 }
 
 _CROSS_ENCODER_MODELS = {
diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py
index f43085b0e969a..4cbe5db44534a 100644
--- a/vllm/worker/pooling_model_runner.py
+++ b/vllm/worker/pooling_model_runner.py
@@ -74,7 +74,16 @@ class PoolingModelRunner(
         prefill_meta = model_input.attn_metadata.prefill_metadata
         decode_meta = model_input.attn_metadata.decode_metadata
         virtual_engine = model_input.virtual_engine
-        if prefill_meta is None and decode_meta.use_cuda_graph:
+        # Pooling models are (ab-)used also to integrate non text models that
+        # are not autoregressive (PrithviGeosaptialMAE).
+        # These model might not use attention and do not really have a prefill
+        # and decode phase. The model input is processed in one shot and both
+        # decode_metadata and prefill_metadata would be None for such models.
+        # See the PlaceholderAttentionMetadata class.
+        # TODO: Figure out if cuda_graph is of any use for these models and
+        #  explore how to leverage it.
+        if (prefill_meta is None and decode_meta is not None
+                and decode_meta.use_cuda_graph):
             assert model_input.input_tokens is not None
             graph_batch_size = model_input.input_tokens.shape[0]
             model_executable = self.graph_runners[virtual_engine][

From 842b0fd402574f49f8828fcff1b8dacc3bcab5fa Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Tue, 11 Feb 2025 20:38:10 -0800
Subject: [PATCH 28/43] [ci] Add more source file dependencies for some tests
 (#13123)

Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
---
 .buildkite/test-pipeline.yaml | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 948eab97ffae2..e26b1bf3818ef 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -107,6 +107,10 @@ steps:
   mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
+  - tests/entrypoints/llm
+  - tests/entrypoints/openai
+  - tests/entrypoints/test_chat_utils
+  - tests/entrypoints/offline_mode
   commands:
   - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
@@ -124,9 +128,10 @@ steps:
   source_file_dependencies:
   - vllm/distributed/
   - vllm/core/
-  - tests/distributed
+  - tests/distributed/test_utils
+  - tests/distributed/test_pynccl
   - tests/spec_decode/e2e/test_integration_dist_tp4
-  - tests/compile
+  - tests/compile/test_basic_correctness
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   commands:
@@ -174,6 +179,9 @@ steps:
   - vllm/
   - tests/engine
   - tests/tokenization
+  - tests/test_sequence
+  - tests/test_config
+  - tests/test_logger
   commands:
   - pytest -v -s engine test_sequence.py test_config.py test_logger.py
   # OOM in the CI unless we run this separately

From e92694b6fe264a85371317295bca6643508034ef Mon Sep 17 00:00:00 2001
From: Lingfan Yu <lingfany@amazon.com>
Date: Tue, 11 Feb 2025 21:12:37 -0800
Subject: [PATCH 29/43] [Neuron][Kernel] Support Longer Sequences in NKI-based
 Flash PagedAttention and Improve Efficiency (#12921)

Signed-off-by: Lingfan Yu <lingfany@amazon.com>
---
 tests/neuron/test_prefix_prefill.py  | 118 ++++++++-------
 vllm/attention/ops/nki_flash_attn.py | 212 +++++++++++----------------
 2 files changed, 152 insertions(+), 178 deletions(-)

diff --git a/tests/neuron/test_prefix_prefill.py b/tests/neuron/test_prefix_prefill.py
index dfbcfc15e2327..04d1bd3f0eb04 100644
--- a/tests/neuron/test_prefix_prefill.py
+++ b/tests/neuron/test_prefix_prefill.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import random
 from typing import Optional
 
 import pytest
@@ -171,12 +170,22 @@ def ref_context_attention(
         return output
 
 
+@pytest.mark.parametrize(
+    "block_size, large_tile_size",
+    [
+        (32, 2048),  # 64 blocks
+        (32, 4096),  # 128 blocks
+        (32, 8192),  # 256 blocks
+        (64, 8192),  # 128 blocks
+    ],
+)
 @pytest.mark.parametrize(
     "num_heads,num_queries_per_kv,head_size,mixed_precision",
     [
         (4, 2, 8, False),
         (4, 2, 8, True),
         (32, 8, 64, True),
+        (16, 2, 128, True),
     ],
 )
 @torch.inference_mode()
@@ -184,6 +193,8 @@ def test_contexted_kv_attention(
     num_heads: int,
     num_queries_per_kv: int,
     head_size: int,
+    block_size: int,
+    large_tile_size,
     mixed_precision: bool,
 ) -> None:
     import os
@@ -192,40 +203,46 @@ def test_contexted_kv_attention(
 
     from vllm.attention.ops.nki_flash_attn import flash_attn_varlen_nkifunc
 
+    assert large_tile_size % block_size == 0
+
     device = xm.xla_device()
 
-    os.environ["NEURON_CC_FLAGS"] = (
-        " --model-type=transformer -O1 "
-        " --internal-hlo2tensorizer-options='--verify-hlo' ")
+    compiler_flags = [
+        "--model-type=transformer -O1",
+        "--internal-hlo2tensorizer-options='--verify-hlo'",
+        "--retry_failed_compilation",
+    ]
+    compiler_flags_str = " ".join(compiler_flags)
+    os.environ["NEURON_CC_FLAGS"] = compiler_flags_str
 
-    random.seed(0)
     torch.manual_seed(0)
     torch.set_printoptions(sci_mode=False)
 
-    min_ctx_len = 2
-    max_ctx_len = 64
-    min_query_len = 2
-    max_query_len = 64
-    prefill_batch_size = 2
-    decode_batch_size = 6
+    min_ctx_len = 32
+    max_ctx_len = 1024
+    min_query_len = 16
+    max_query_len = 512
+    prefill_batch_size = 4
+    decode_batch_size = 12
     batch_size = prefill_batch_size + decode_batch_size
-    block_size = 32
     max_model_len = (max_query_len + max_ctx_len) * 4
 
     max_block_per_request = max_model_len // block_size
     dtype = torch.float32
     cache_size = (batch_size * max_block_per_request) + 2
-    ctx_lens = [
-        random.randint(min_ctx_len, max_ctx_len)
-        for _ in range(prefill_batch_size)
-    ] + [
-        random.randint(min_ctx_len, max_ctx_len)
-        for _ in range(decode_batch_size)
-    ]
-    query_lens = [
-        random.randint(min_query_len, max_query_len)
-        for _ in range(prefill_batch_size)
-    ] + [1 for _ in range(decode_batch_size)]
+    prefill_ctx_lens = torch.randint(min_ctx_len,
+                                     max_ctx_len + 1, (prefill_batch_size, ),
+                                     dtype=torch.long).tolist()
+    decode_ctx_lens = torch.randint(min_ctx_len,
+                                    max_ctx_len + 1, (decode_batch_size, ),
+                                    dtype=torch.long).tolist()
+    ctx_lens = prefill_ctx_lens + decode_ctx_lens
+    query_lens = torch.randint(
+        min_query_len,
+        max_query_len + 1,
+        (prefill_batch_size, ),
+        dtype=torch.long,
+    ).tolist() + [1 for _ in range(decode_batch_size)]
     seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)]
     num_kv_heads = num_heads // num_queries_per_kv
 
@@ -254,7 +271,6 @@ def test_contexted_kv_attention(
     values = values[torch.randperm(cache_size)]
     block_table = values[:batch_size * max_block_per_request].view(
         batch_size, max_block_per_request)
-    torch.tensor(seq_lens, dtype=torch.long)
     b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long)
     b_start_loc = torch.cumsum(torch.tensor([0] + query_lens[:-1],
                                             dtype=torch.long),
@@ -311,9 +327,7 @@ def test_contexted_kv_attention(
     # build neuron program
     return_debug_tensors = False
     B_P_SIZE = 128
-    LARGE_TILE_SZ = 2048
-    max_num_queries = (
-        (sum(query_lens) + block_size - 1) // block_size) * block_size
+    LARGE_TILE_SZ = large_tile_size
 
     def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
                                 num_blocks):
@@ -332,26 +346,28 @@ def test_contexted_kv_attention(
             0,
         )
 
-    def shift_bit_length(x):
-        return 1 << (x - 1).bit_length()
+    def ceil_div(a, b):
+        return (a + b - 1) // b
+
+    def pad_to_multiple(a, b):
+        return ceil_div(a, b) * b
+
+    def pad_to_next_power_of_2(a):
+        assert a > 0
+        return 2**int(a - 1).bit_length()
 
     # calculate input shapes
-    max_num_queries_shifted = shift_bit_length(max_num_queries)
-    max_num_queries_factor = B_P_SIZE // max_num_queries_shifted
-    max_num_queries_padded = max_num_queries_shifted * max_num_queries_factor
-    assert (max_num_queries_padded == B_P_SIZE
-            ), "invalid {max_num_queries_padded=}"
+    max_num_queries = pad_to_multiple(sum(query_lens), block_size)
+    max_num_queries = pad_to_next_power_of_2(max_num_queries)
     head_size_padded = B_P_SIZE
+    assert head_size_padded >= head_size
     context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
-    num_active_blocks_shifted = shift_bit_length(
-        ((context_lens + block_size - 1) // block_size).sum().item())
-    num_active_blocks_factor = (LARGE_TILE_SZ // block_size //
-                                num_active_blocks_shifted)
-    num_active_blocks = num_active_blocks_shifted * num_active_blocks_factor
-    assert (num_active_blocks *
-            block_size) == LARGE_TILE_SZ, "invalid {num_active_blocks=}"
+    num_active_blocks = ceil_div(context_lens, block_size).sum().item()
+    num_active_blocks = pad_to_multiple(num_active_blocks,
+                                        LARGE_TILE_SZ // block_size)
     context_kv_len = num_active_blocks * block_size
-    assert context_kv_len == LARGE_TILE_SZ, f"invalid {context_kv_len=}"
+    assert (context_kv_len %
+            LARGE_TILE_SZ == 0), f"invalid context_kv_len={context_kv_len}"
 
     # pad QKV tensors
     pad_dims = (
@@ -360,7 +376,7 @@ def test_contexted_kv_attention(
         0,
         0,
         0,
-        max_num_queries_padded - query.shape[0],
+        max_num_queries - query.shape[0],
     )
     query = F.pad(query, pad_dims, "constant", 0)
     k = F.pad(k, pad_dims, "constant", 0)
@@ -397,7 +413,7 @@ def test_contexted_kv_attention(
                     0,
                     context_kv_len - prior_mask.shape[1],
                     0,
-                    B_P_SIZE - prior_mask.shape[0],
+                    max_num_queries - prior_mask.shape[0],
                 ),
                 "constant",
                 0,
@@ -406,9 +422,9 @@ def test_contexted_kv_attention(
                 active_mask,
                 (
                     0,
-                    B_P_SIZE - active_mask.shape[1],
+                    max_num_queries - active_mask.shape[1],
                     0,
-                    B_P_SIZE - active_mask.shape[0],
+                    max_num_queries - active_mask.shape[0],
                 ),
                 "constant",
                 0,
@@ -430,6 +446,8 @@ def test_contexted_kv_attention(
         n_kv_head=num_kv_heads,
         head_size=head_size,
         mixed_precision=mixed_precision,
+        LARGE_TILE_SZ=LARGE_TILE_SZ,
+        return_debug_tensors=return_debug_tensors,
     )
 
     if return_debug_tensors:
@@ -439,17 +457,15 @@ def test_contexted_kv_attention(
         output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs)
         debug_tensors = []
 
-    output_nki = torch.tensor(output_nki).cpu()
     debug_tensors = [torch.tensor(dt).cpu() for dt in debug_tensors]
 
     num_actual_tokens = sum(query_lens)
-    print(f"{num_actual_tokens=}")
     # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
-    output_nki = output_nki.permute(
-        0, 2, 1, 3)[:, :, :, :head_size].cpu()[0, :num_actual_tokens, :, :]
+    output_nki = output_nki.cpu().permute(0, 2, 1, 3)[:, :, :, :head_size]
+    output_nki = output_nki[0, :num_actual_tokens, :, :]
     output_ref_padded = F.pad(
         output_ref,
-        (0, 0, 0, 0, 0, 0, 0, max_num_queries_padded - output_ref.shape[0]),
+        (0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]),
         "constant",
         0,
     )
diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py
index 68aa63f5ac16c..5e2a1f7e66d1f 100644
--- a/vllm/attention/ops/nki_flash_attn.py
+++ b/vllm/attention/ops/nki_flash_attn.py
@@ -28,7 +28,6 @@ class FlashConfig:
 def transpose_p_local(p_local_transposed,
                       p_local,
                       LARGE_TILE_SZ,
-                      forward_mask,
                       B_F_SIZE=512):
     for i in nl.affine_range(LARGE_TILE_SZ // B_F_SIZE):
         if nisa.get_nc_version() == nisa.nc_version.gen3:
@@ -46,13 +45,13 @@ def transpose_p_local(p_local_transposed,
 
             if nisa.get_nc_version() == nisa.nc_version.gen3:
                 p_local_t_tmp[:, j_128_slice] = nisa.dma_transpose(
-                    p_local[:, i_j_128_slice], mask=forward_mask)
+                    p_local[:, i_j_128_slice])
             else:
                 p_local_t_tmp[:, j_128_slice] = nisa.nc_transpose(
-                    p_local[:, i_j_128_slice], mask=forward_mask)
+                    p_local[:, i_j_128_slice])
 
         p_local_transposed[:, nl.ds(i * B_F_SIZE, B_F_SIZE)] = nl.copy(
-            p_local_t_tmp, dtype=p_local_transposed.dtype, mask=forward_mask)
+            p_local_t_tmp, dtype=p_local_transposed.dtype)
 
 
 @nki.jit
@@ -60,36 +59,25 @@ def _flash_attention_core(
     q_local_tile,
     k,
     v,
-    q_h_per_k_h,
-    seqlen_q,
-    nheads,
     o_buffer,
     l_buffer,
     m_buffer,
-    batch_id,
-    head_id,
-    gqa_head_idx,
     q_tile_idx,
-    local_k_large_tile_idx,
     kernel_dtype,
     acc_type,
     flash_config: FlashConfig,
-    use_causal_mask=False,
-    continuous_batching_mask=None,
+    use_causal_mask,
+    tile_mask,
     initialize=False,
     B_P_SIZE=128,
     B_F_SIZE=512,
     B_D_SIZE=128,
-    dropout_p=0.0,
-    dropout_p_tensor=None,
-    seed_tensor=None,
-    logit_bias_tile=None,
     qk_res_buffer=None,
 ):
     """
     The flash attention core function to calculate self attention between a tile
     of q and a block of K and V.
-    The q_local_tile has (B_P_SIZE, B_F_SIZE), which is loaded into the SBUF 
+    The q_local_tile has (B_P_SIZE, B_F_SIZE), which is loaded into the SBUF
     already. The block size of K and V
     is defined in the seq_tile_size of the flash_config. The results are stored
     in the following three buffers
@@ -99,24 +87,9 @@ def _flash_attention_core(
     """
     LARGE_TILE_SZ = flash_config.seq_tile_size
     num_k_tile_per_large_tile = LARGE_TILE_SZ // B_F_SIZE
-    seqlen_k = k.shape[-1]
-    seqlen_q // B_P_SIZE
-    seqlen_k // B_F_SIZE
-
-    # TODO : support logit_bias with continuous_batching_mask
-    assert not use_causal_mask, "causal mask is not supported."
-    assert (continuous_batching_mask
-            is not None), "continuous_batching_mask input is required."
-    if continuous_batching_mask is not None:
-        assert (
-            logit_bias_tile
-            is None), "continuous_batching_mask does not support logit_bias!"
 
     # mask are used to only apply computation to the lower half of the matrix,
     # which reduce the arithmetic intensity by half
-    forward_mask = (q_tile_idx * B_P_SIZE >= local_k_large_tile_idx *
-                    LARGE_TILE_SZ if use_causal_mask else None)
-
     qk_res_buf = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ),
                             buffer=nl.sbuf,
                             dtype=acc_type)
@@ -125,20 +98,27 @@ def _flash_attention_core(
     for k_i in nl.affine_range(num_k_tile_per_large_tile):
         k_i_b_f_slice = nl.ds(k_i * B_F_SIZE, B_F_SIZE)
 
-        qk_psum = nl.zeros((par_dim(B_P_SIZE), B_F_SIZE),
-                           dtype=np.float32,
-                           buffer=nl.psum)  # (128, 512)
-        qk_psum[:, :] = nl.matmul(q_local_tile,
-                                  k[:, k_i_b_f_slice],
-                                  transpose_x=True,
-                                  mask=None)  # (p(128), 512)
+        if use_causal_mask:
+            multiplication_required_selection = (q_tile_idx * B_P_SIZE
+                                                 >= k_i * B_F_SIZE)
+        else:
+            multiplication_required_selection = True
 
-        qk_res_buf[:, k_i_b_f_slice] = nl.where(
-            continuous_batching_mask[:, k_i_b_f_slice],
-            qk_psum[:, nl.ds(0, B_F_SIZE)],
-            -9984.0,
-            dtype=acc_type,
-        )
+        if multiplication_required_selection:
+            qk_psum = nl.ndarray((par_dim(B_P_SIZE), B_F_SIZE),
+                                 dtype=np.float32,
+                                 buffer=nl.psum)  # (128, 512)
+            qk_psum[:, :] = nl.matmul(q_local_tile,
+                                      k[:, k_i_b_f_slice],
+                                      transpose_x=True)  # (p(128), 512)
+            qk_res_buf[:, k_i_b_f_slice] = nl.where(
+                tile_mask[:, k_i_b_f_slice],
+                qk_psum[:, nl.ds(0, B_F_SIZE)],
+                -9984.0,
+                dtype=acc_type,
+            )
+        else:
+            qk_res_buf[:, k_i_b_f_slice] = -9984.0
 
         # Calculate max of the current tile
         max_local[:, k_i] = nisa.tensor_reduce(
@@ -147,7 +127,6 @@ def _flash_attention_core(
             axis=(1, ),
             dtype=acc_type,
             negate=False,
-            mask=forward_mask,
         )
 
     if qk_res_buffer is not None:
@@ -159,7 +138,6 @@ def _flash_attention_core(
         axis=(1, ),
         dtype=acc_type,
         negate=False,
-        mask=forward_mask,
     )
 
     o_previous_scaled = nl.ndarray((par_dim(B_P_SIZE), B_D_SIZE),
@@ -170,8 +148,7 @@ def _flash_attention_core(
         m_current = max_
     else:
         m_previous = nl.copy(m_buffer[:, 0])
-        m_buffer[:, 0] = nl.maximum(m_previous, max_,
-                                    mask=forward_mask)  # (128,1)
+        m_buffer[:, 0] = nl.maximum(m_previous, max_)  # (128,1)
 
         m_current = m_buffer[:, 0]
         # Compute scaling factor
@@ -180,11 +157,8 @@ def _flash_attention_core(
             m_previous,
             bias=-1 * m_current,
             scale=1.0,
-            mask=forward_mask,
         )
-        o_previous_scaled[...] = nl.multiply(o_buffer[:, :],
-                                             alpha,
-                                             mask=forward_mask)
+        o_previous_scaled[...] = nl.multiply(o_buffer[:, :], alpha)
 
     p_local = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ),
                          dtype=kernel_dtype)
@@ -207,10 +181,9 @@ def _flash_attention_core(
             reduce_op=nl.add,
             reduce_res=p_partial_sum[:, k_r_i],
             dtype=kernel_dtype,
-            mask=forward_mask,
         )
 
-    ps = nl.sum(p_partial_sum, axis=1, dtype=acc_type, mask=forward_mask)
+    ps = nl.sum(p_partial_sum, axis=1, dtype=acc_type)
 
     p_local_transposed = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ),
                                     dtype=kernel_dtype)
@@ -218,7 +191,6 @@ def _flash_attention_core(
         p_local_transposed=p_local_transposed,
         p_local=p_local,
         LARGE_TILE_SZ=LARGE_TILE_SZ,
-        forward_mask=forward_mask,
         B_F_SIZE=B_F_SIZE,
     )
 
@@ -230,27 +202,20 @@ def _flash_attention_core(
             p_local_transposed[:, nl.ds(k_i * B_P_SIZE, B_P_SIZE)],
             v[k_i, :, :],
             transpose_x=True,
-            mask=forward_mask,
         )  # (128, 128) (p(Br), d)
 
     if initialize:
         o_buffer[:, :] = nl.copy(pv_psum[:, :])
         l_buffer[:, 0] = nl.add(nl.log(ps), max_)
     else:
-        o_buffer[:, :] = nl.add(o_previous_scaled, pv_psum, mask=forward_mask)
+        o_buffer[:, :] = nl.add(o_previous_scaled, pv_psum)
 
         l_prev = l_buffer[:, 0]
         l_exp = nl.add(
-            nl.exp(
-                nl.subtract(l_prev, m_current, mask=forward_mask),
-                mask=forward_mask,
-            ),
+            nl.exp(nl.subtract(l_prev, m_current)),
             ps,
-            mask=forward_mask,
         )
-        l_buffer[:, 0] = nl.add(m_current,
-                                nl.log(l_exp, mask=forward_mask),
-                                mask=forward_mask)
+        l_buffer[:, 0] = nl.add(m_current, nl.log(l_exp))
 
 
 @nki.jit
@@ -279,6 +244,21 @@ def load_v_tile(v_hbm_tile, cur_v_tile, j, v_i, config):
     )
 
 
+@nki.jit
+def load_block_tables(block_tables_hbm, num_tiles):
+    (num_blocks, ) = block_tables_hbm.shape
+    assert num_blocks % num_tiles == 0
+    num_blocks_per_tile = num_blocks // num_tiles
+    block_tables_hbm = block_tables_hbm.reshape(
+        (num_tiles, num_blocks_per_tile))
+    block_tables_buffer = nl.load(block_tables_hbm, dtype=nl.int32)
+    return block_tables_buffer
+
+
+def is_power_of_2(x):
+    return x > 0 and (x & (x - 1)) == 0
+
+
 @nki.jit
 def flash_paged_attention(
     query,
@@ -316,24 +296,24 @@ def flash_paged_attention(
       - We use paged cache blocks (key_cache, value_cache) to store KV cache.
 
     IO tensor dtypes:
-      - This kernel assumes all IO tensors have the same dtype except for 
+      - This kernel assumes all IO tensors have the same dtype except for
         block_tables (int32) and mask (int32)
-      - If mixed_percision is True, then all Tensor Engine operation will be 
-        performed in bfloat16 and accumulation will be performed in float32. 
+      - If mixed_percision is True, then all Tensor Engine operation will be
+        performed in bfloat16 and accumulation will be performed in float32.
         Otherwise the intermediates will be in the same type as the inputs.
 
     Compile-time Constants:
       - softmax_scale: scaling for softmax, is None, default is `1.0/(d**0.5)`
       - mixed_precision: flag to set non-matmul ops in fp32 precision, default
-        is set to `true`, if false, we use same precision as input types 
+        is set to `true`, if false, we use same precision as input types
       - config: Instance of dataclass :class:`nki.kernels.attention.FlashConfig`
           with Performance config parameters for flash attention with default
           values
-        seq_tile_size: `default=2048`, size of the kv tile size for attention 
+        seq_tile_size: `default=2048`, size of the kv tile size for attention
           computation reduction
 
     GQA support Notes:
-      the spmd kernel for launching kernel should be on kv_heads instead of 
+      the spmd kernel for launching kernel should be on kv_heads instead of
       nheads
 
     Example usage:
@@ -415,18 +395,13 @@ def flash_paged_attention(
             ), f"Need B_P_SIZE ({B_P_SIZE}) to be divisible by {block_size=}"
     num_large_k_tile = context_kv_len // LARGE_TILE_SZ
     num_blocks_per_large_tile = LARGE_TILE_SZ // block_size
-    assert (num_blocks_per_large_tile <= B_P_SIZE
-    ), f"The number of blocks in each large tile " \
-    f"({num_blocks_per_large_tile}) shouldn't exceed partition size {B_P_SIZE}"
+    assert block_size % 32 == 0, "block_size is expected to be a multiple of 32"
+    assert is_power_of_2(
+        num_blocks_per_large_tile
+    ), "The number of blocks in each large tile is expected of be power of 2"
+    assert is_power_of_2(seqlen_q), "seqlen_q is expected to be power of 2"
 
-    block_tables_sbuf = nl.full((par_dim(B_P_SIZE), num_large_k_tile),
-                                0,
-                                dtype=np.int32,
-                                buffer=nl.sbuf)
-    for j in nl.affine_range(num_large_k_tile):
-        i_p = nl.arange(num_blocks_per_large_tile)[:, None]
-        block_tables_sbuf[i_p, j] = nl.load(
-            block_tables[j * num_blocks_per_large_tile + i_p], dtype=np.int32)
+    block_tables_sbuf = load_block_tables(block_tables, num_large_k_tile)
 
     # Global Flash Attention accumulators
     o_buffer = nl.zeros(
@@ -457,7 +432,7 @@ def flash_paged_attention(
         )
 
         for k_i in nl.affine_range(num_blocks_per_large_tile):
-            loaded = nl.load(key_cache[block_tables_sbuf[k_i, j], :,
+            loaded = nl.load(key_cache[block_tables_sbuf[j, k_i], :,
                                        head_id, :])
             cur_k_tile[:, nl.ds(k_i *
                                 block_size, block_size)] = nl.transpose(loaded)
@@ -469,7 +444,7 @@ def flash_paged_attention(
                     num_blocks_per_partition):
                 v_i = (partition_idx * num_blocks_per_partition +
                        block_in_partition)
-                loaded_v = nl.load(value_cache[block_tables_sbuf[v_i, j], :,
+                loaded_v = nl.load(value_cache[block_tables_sbuf[j, v_i], :,
                                                head_id, :])
                 cur_v_tile[
                     partition_idx,
@@ -477,14 +452,15 @@ def flash_paged_attention(
                     :,
                 ] = loaded_v
 
-        cur_mask = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ),
-                              dtype=mask.dtype)
-        for m_i in nl.affine_range(LARGE_TILE_SZ // B_F_SIZE):
-            cur_mask[:, nl.ds(m_i * B_F_SIZE, B_F_SIZE)] = nl.load(
-                mask[:, nl.ds(j * LARGE_TILE_SZ + m_i * B_F_SIZE, B_F_SIZE)])
-
-        for i_q_h in nl.affine_range(q_h_per_k_h):
-            for i in nl.affine_range(n_tile_q):
+        for i in nl.affine_range(n_tile_q):
+            cur_mask = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ),
+                                  dtype=mask.dtype)
+            for m_i in nl.affine_range(LARGE_TILE_SZ // B_F_SIZE):
+                cur_mask[:, nl.ds(m_i * B_F_SIZE, B_F_SIZE)] = nl.load(mask[
+                    nl.ds(i * B_P_SIZE, B_P_SIZE),
+                    nl.ds(j * LARGE_TILE_SZ + m_i * B_F_SIZE, B_F_SIZE),
+                ])
+            for i_q_h in nl.affine_range(q_h_per_k_h):
                 q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype)
                 q_hbm_tile = query[batch_id, head_id * q_h_per_k_h + i_q_h]
                 q_sbuf_tile = nl.load(
@@ -497,35 +473,24 @@ def flash_paged_attention(
                     q_local_tile=q_tile,
                     k=cur_k_tile,
                     v=cur_v_tile,
-                    q_h_per_k_h=q_h_per_k_h,
-                    seqlen_q=seqlen_q,
-                    nheads=h,
                     o_buffer=o_buffer[i, i_q_h],
                     l_buffer=l_buffer[:, i, i_q_h],
                     m_buffer=m_buffer[i, i_q_h],
-                    batch_id=batch_id,
-                    head_id=head_id,
-                    gqa_head_idx=i_q_h,
                     q_tile_idx=i,
-                    local_k_large_tile_idx=j,
                     kernel_dtype=kernel_dtype,
                     acc_type=acc_type,
                     flash_config=config,
                     use_causal_mask=False,
-                    continuous_batching_mask=cur_mask,
+                    tile_mask=cur_mask,
                     initialize=j == 0,
                     B_P_SIZE=B_P_SIZE,
                     B_F_SIZE=B_F_SIZE,
                     B_D_SIZE=B_D_SIZE,
-                    dropout_p=0.0,
-                    dropout_p_tensor=None,
-                    seed_tensor=None,
-                    logit_bias_tile=None,
                 )
 
     # compute attention between input query, key and value
     if key is not None and value is not None:
-        B_F_SIZE = seqlen_q
+        B_F_SIZE = min(seqlen_q, B_F_SIZE)
         LARGE_TILE_SZ = seqlen_q
         active_config = FlashConfig(
             seq_tile_size=LARGE_TILE_SZ,
@@ -552,11 +517,16 @@ def flash_paged_attention(
                 config=active_config,
             )
 
-        cur_mask = nl.ndarray((par_dim(B_P_SIZE), B_F_SIZE), dtype=mask.dtype)
-        cur_mask[:, :] = nl.load(mask[:, nl.ds(context_kv_len, B_F_SIZE)])
+        for i in nl.affine_range(n_tile_q):
+            cur_mask = nl.load(
+                mask[
+                    nl.ds(i * B_P_SIZE, B_P_SIZE),
+                    nl.ds(context_kv_len, LARGE_TILE_SZ),
+                ],
+                dtype=mask.dtype,
+            )
+            for i_q_h in nl.affine_range(q_h_per_k_h):
 
-        for i_q_h in nl.affine_range(q_h_per_k_h):
-            for i in nl.affine_range(n_tile_q):
                 q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype)
                 q_hbm_tile = query[batch_id, head_id * q_h_per_k_h + i_q_h]
                 q_sbuf_tile = nl.load(
@@ -568,32 +538,21 @@ def flash_paged_attention(
                     q_local_tile=q_tile,
                     k=cur_k_tile,
                     v=cur_v_tile,
-                    q_h_per_k_h=q_h_per_k_h,
-                    seqlen_q=seqlen_q,
-                    nheads=h,
                     o_buffer=o_buffer[i, i_q_h],
                     l_buffer=l_buffer[:, i, i_q_h],
                     m_buffer=m_buffer[i, i_q_h],
-                    batch_id=batch_id,
-                    head_id=head_id,
-                    gqa_head_idx=i_q_h,
                     q_tile_idx=i,
-                    local_k_large_tile_idx=0,
                     kernel_dtype=kernel_dtype,
                     acc_type=acc_type,
                     flash_config=active_config,
-                    use_causal_mask=False,
-                    continuous_batching_mask=cur_mask,
+                    use_causal_mask=True,
+                    tile_mask=cur_mask,
                     initialize=False,
                     B_P_SIZE=B_P_SIZE,
                     B_F_SIZE=B_F_SIZE,
                     B_D_SIZE=B_D_SIZE,
-                    dropout_p=0.0,
-                    dropout_p_tensor=None,
-                    seed_tensor=None,
-                    logit_bias_tile=None,
-                    qk_res_buffer=qk_res_buffer[i, i_q_h]
-                    if qk_res_buffer is not None else None,
+                    qk_res_buffer=(qk_res_buffer[i, i_q_h]
+                                   if qk_res_buffer is not None else None),
                 )
 
     # -- -- -- -- write output to buffer on HBM -- -- -- -- -- -- #
@@ -652,7 +611,6 @@ def flash_attn_varlen_nkifunc(
     attn_mask,
     n_kv_head=None,
     head_size=None,
-    B_P_SIZE=128,
     LARGE_TILE_SZ=2048,
     return_debug_tensors=False,
     mixed_precision=True,

From a0597c6b7534c383accab86f9967176a7ece4aae Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 12 Feb 2025 00:40:19 -0800
Subject: [PATCH 30/43] Bump helm/kind-action from 1.10.0 to 1.12.0 (#11612)

---
 .github/workflows/lint-and-deploy.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml
index 556b60d2fca12..9d2e54ce9261a 100644
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@@ -47,7 +47,7 @@ jobs:
           aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
 
       - name: Create kind cluster
-        uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0
+        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
 
       - name: Build the Docker image vllm cpu
         run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .

From dd3b4a01f84131c1c3d94d0bf0cbf95a98eec586 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 12 Feb 2025 00:40:25 -0800
Subject: [PATCH 31/43] Bump actions/stale from 9.0.0 to 9.1.0 (#12462)

---
 .github/workflows/stale.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index 81e7c9b050760..656f3d3fa7bc4 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -13,7 +13,7 @@ jobs:
       actions: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
+      - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
         with:
           # Increasing this value ensures that changes to this workflow
           # propagate to all issues and PRs in days rather than months

From 0c7d9effce8121d769e932a22e5753749e826f60 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 12 Feb 2025 16:41:06 +0800
Subject: [PATCH 32/43] Bump helm/chart-testing-action from 2.6.1 to 2.7.0
 (#12463)

---
 .github/workflows/lint-and-deploy.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml
index 9d2e54ce9261a..99365c67c29e8 100644
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@@ -22,7 +22,7 @@ jobs:
           python-version: '3.13'
 
       - name: Set up chart-testing
-        uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1
+        uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0
         with:
           version: v3.10.1
 

From d59def47305487ca523379dd97073f5ab037d663 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 12 Feb 2025 16:41:22 +0800
Subject: [PATCH 33/43] Bump actions/setup-python from 5.3.0 to 5.4.0 (#12672)

---
 .github/workflows/cleanup_pr_body.yml  | 2 +-
 .github/workflows/lint-and-deploy.yaml | 2 +-
 .github/workflows/pre-commit.yml       | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
index 0085a1cc22373..50fea0c43cb8c 100644
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -16,7 +16,7 @@ jobs:
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
 
       - name: Set up Python
-        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
         with:
           python-version: '3.12'
 
diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml
index 99365c67c29e8..a4e9acc414d44 100644
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@@ -17,7 +17,7 @@ jobs:
           version: v3.14.4
 
        #Python is required because ct lint runs Yamale and yamllint which require Python.
-      - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+      - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
         with:
           python-version: '3.13'
 
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 06564969dc778..dc10b9116bbd8 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -10,7 +10,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+    - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
       with:
         python-version: "3.12"
     - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"

From 7c4033acd46749f9da48aa8e347648d47e2f4876 Mon Sep 17 00:00:00 2001
From: Maximilien de Bayser <mbayser@br.ibm.com>
Date: Wed, 12 Feb 2025 07:34:09 -0300
Subject: [PATCH 34/43] Further reduce the HTTP calls to huggingface.co
 (#13107)

---
 vllm/transformers_utils/config.py | 135 +++++++++++++++++-------------
 1 file changed, 79 insertions(+), 56 deletions(-)

diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index aade28610b313..4b76509e4541f 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -4,12 +4,14 @@ import enum
 import json
 import os
 import time
+from functools import cache
 from pathlib import Path
-from typing import Any, Dict, Literal, Optional, Type, Union
+from typing import Any, Callable, Dict, Literal, Optional, Type, Union
 
 import huggingface_hub
-from huggingface_hub import (file_exists, hf_hub_download, list_repo_files,
-                             try_to_load_from_cache)
+from huggingface_hub import hf_hub_download
+from huggingface_hub import list_repo_files as hf_list_repo_files
+from huggingface_hub import try_to_load_from_cache
 from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError,
                                    HFValidationError, LocalEntryNotFoundError,
                                    RepositoryNotFoundError,
@@ -86,6 +88,65 @@ class ConfigFormat(str, enum.Enum):
     MISTRAL = "mistral"
 
 
+def with_retry(func: Callable[[], Any],
+               log_msg: str,
+               max_retries: int = 2,
+               retry_delay: int = 2):
+    for attempt in range(max_retries):
+        try:
+            return func()
+        except Exception as e:
+            if attempt == max_retries - 1:
+                logger.error("%s: %s", log_msg, e)
+                raise
+            logger.error("%s: %s, retrying %d of %d", log_msg, e, attempt + 1,
+                         max_retries)
+            time.sleep(retry_delay)
+            retry_delay *= 2
+
+
+# @cache doesn't cache exceptions
+@cache
+def list_repo_files(
+    repo_id: str,
+    *,
+    revision: Optional[str] = None,
+    repo_type: Optional[str] = None,
+    token: Union[str, bool, None] = None,
+) -> list[str]:
+
+    def lookup_files():
+        try:
+            return hf_list_repo_files(repo_id,
+                                      revision=revision,
+                                      repo_type=repo_type,
+                                      token=token)
+        except huggingface_hub.errors.OfflineModeIsEnabled:
+            # Don't raise in offline mode,
+            # all we know is that we don't have this
+            # file cached.
+            return []
+
+    return with_retry(lookup_files, "Error retrieving file list")
+
+
+def file_exists(
+    repo_id: str,
+    file_name: str,
+    *,
+    repo_type: Optional[str] = None,
+    revision: Optional[str] = None,
+    token: Union[str, bool, None] = None,
+) -> bool:
+
+    file_list = list_repo_files(repo_id,
+                                repo_type=repo_type,
+                                revision=revision,
+                                token=token)
+    return file_name in file_list
+
+
+# In offline mode the result can be a false negative
 def file_or_path_exists(model: Union[str, Path], config_name: str,
                         revision: Optional[str]) -> bool:
     if Path(model).exists():
@@ -103,31 +164,10 @@ def file_or_path_exists(model: Union[str, Path], config_name: str,
     # hf_hub. This will fail in offline mode.
 
     # Call HF to check if the file exists
-    # 2 retries and exponential backoff
-    max_retries = 2
-    retry_delay = 2
-    for attempt in range(max_retries):
-        try:
-            return file_exists(model,
-                               config_name,
-                               revision=revision,
-                               token=HF_TOKEN)
-        except huggingface_hub.errors.OfflineModeIsEnabled:
-            # Don't raise in offline mode,
-            # all we know is that we don't have this
-            # file cached.
-            return False
-        except Exception as e:
-            logger.error(
-                "Error checking file existence: %s, retrying %d of %d", e,
-                attempt + 1, max_retries)
-            if attempt == max_retries - 1:
-                logger.error("Error checking file existence: %s", e)
-                raise
-            time.sleep(retry_delay)
-            retry_delay *= 2
-            continue
-    return False
+    return file_exists(str(model),
+                       config_name,
+                       revision=revision,
+                       token=HF_TOKEN)
 
 
 def patch_rope_scaling(config: PretrainedConfig) -> None:
@@ -208,32 +248,7 @@ def get_config(
                                  revision=revision):
             config_format = ConfigFormat.MISTRAL
         else:
-            # If we're in offline mode and found no valid config format, then
-            # raise an offline mode error to indicate to the user that they
-            # don't have files cached and may need to go online.
-            # This is conveniently triggered by calling file_exists().
-
-            # Call HF to check if the file exists
-            # 2 retries and exponential backoff
-            max_retries = 2
-            retry_delay = 2
-            for attempt in range(max_retries):
-                try:
-                    file_exists(model,
-                                HF_CONFIG_NAME,
-                                revision=revision,
-                                token=HF_TOKEN)
-                except Exception as e:
-                    logger.error(
-                        "Error checking file existence: %s, retrying %d of %d",
-                        e, attempt + 1, max_retries)
-                    if attempt == max_retries:
-                        logger.error("Error checking file existence: %s", e)
-                        raise e
-                    time.sleep(retry_delay)
-                    retry_delay *= 2
-
-            raise ValueError(f"No supported config format found in {model}")
+            raise ValueError(f"No supported config format found in {model}.")
 
     if config_format == ConfigFormat.HF:
         config_dict, _ = PretrainedConfig.get_config_dict(
@@ -339,10 +354,11 @@ def get_hf_file_to_dict(file_name: str,
                                    file_name=file_name,
                                    revision=revision)
 
-    if file_path is None and file_or_path_exists(
-            model=model, config_name=file_name, revision=revision):
+    if file_path is None:
         try:
             hf_hub_file = hf_hub_download(model, file_name, revision=revision)
+        except huggingface_hub.errors.OfflineModeIsEnabled:
+            return None
         except (RepositoryNotFoundError, RevisionNotFoundError,
                 EntryNotFoundError, LocalEntryNotFoundError) as e:
             logger.debug("File or repository not found in hf_hub_download", e)
@@ -363,6 +379,7 @@ def get_hf_file_to_dict(file_name: str,
     return None
 
 
+@cache
 def get_pooling_config(model: str, revision: Optional[str] = 'main'):
     """
     This function gets the pooling and normalize 
@@ -390,6 +407,8 @@ def get_pooling_config(model: str, revision: Optional[str] = 'main'):
     if modules_dict is None:
         return None
 
+    logger.info("Found sentence-transformers modules configuration.")
+
     pooling = next((item for item in modules_dict
                     if item["type"] == "sentence_transformers.models.Pooling"),
                    None)
@@ -408,6 +427,7 @@ def get_pooling_config(model: str, revision: Optional[str] = 'main'):
         if pooling_type_name is not None:
             pooling_type_name = get_pooling_config_name(pooling_type_name)
 
+        logger.info("Found pooling configuration.")
         return {"pooling_type": pooling_type_name, "normalize": normalize}
 
     return None
@@ -435,6 +455,7 @@ def get_pooling_config_name(pooling_name: str) -> Union[str, None]:
     return None
 
 
+@cache
 def get_sentence_transformer_tokenizer_config(model: str,
                                               revision: Optional[str] = 'main'
                                               ):
@@ -491,6 +512,8 @@ def get_sentence_transformer_tokenizer_config(model: str,
     if not encoder_dict:
         return None
 
+    logger.info("Found sentence-transformers tokenize configuration.")
+
     if all(k in encoder_dict for k in ("max_seq_length", "do_lower_case")):
         return encoder_dict
     return None

From f1042e86f05cfe93bcadac445e78671ed2e8fddb Mon Sep 17 00:00:00 2001
From: Shiyan Deng <842974287@qq.com>
Date: Wed, 12 Feb 2025 02:36:10 -0800
Subject: [PATCH 35/43] [Misc] AMD Build Improvements (#12923)

---
 csrc/moe/moe_align_sum_kernels.cu           |  2 +-
 csrc/rocm/attention.cu                      |  2 +-
 vllm/model_executor/models/registry.py      | 15 +++++++++++----
 vllm/transformers_utils/configs/__init__.py |  2 +-
 4 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index 01dac40446501..c072744f06685 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -3,7 +3,7 @@
 #include <c10/cuda/CUDAGuard.h>
 
 #include <ATen/ATen.h>
-#include <THC/THCAtomics.cuh>
+#include <ATen/cuda/Atomic.cuh>
 
 #include "../cuda_compat.h"
 #include "../dispatch_utils.h"
diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu
index ffa9d44610a7f..366b3cdc23aa1 100644
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@@ -1122,4 +1122,4 @@ void paged_attention(
 #undef WARP_SIZE
 #undef MAX
 #undef MIN
-#undef DIVIDE_ROUND_UP
\ No newline at end of file
+#undef DIVIDE_ROUND_UP
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index ebf6a88f21b8e..198b6d134718f 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -205,6 +205,14 @@ _VLLM_MODELS = {
     **_FALLBACK_MODEL,
 }
 
+# This variable is used as the args for subprocess.run(). We
+# can modify  this variable to alter the args if needed. e.g.
+# when we use par format to pack things together, sys.executable
+# might not be the target we want to run.
+_SUBPROCESS_COMMAND = [
+    sys.executable, "-m", "vllm.model_executor.models.registry"
+]
+
 
 @dataclass(frozen=True)
 class _ModelInfo:
@@ -502,10 +510,9 @@ def _run_in_subprocess(fn: Callable[[], _T]) -> _T:
 
         # cannot use `sys.executable __file__` here because the script
         # contains relative imports
-        returned = subprocess.run(
-            [sys.executable, "-m", "vllm.model_executor.models.registry"],
-            input=input_bytes,
-            capture_output=True)
+        returned = subprocess.run(_SUBPROCESS_COMMAND,
+                                  input=input_bytes,
+                                  capture_output=True)
 
         # check if the subprocess is successful
         try:
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index c484a755ab4ec..9060565596b21 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -45,4 +45,4 @@ __all__ = [
     "SolarConfig",
     "Telechat2Config",
     "UltravoxConfig",
-]
\ No newline at end of file
+]

From f4d97e4fc276b13e1a4ec18f35239fd48695667d Mon Sep 17 00:00:00 2001
From: bnellnm <49004751+bnellnm@users.noreply.github.com>
Date: Wed, 12 Feb 2025 05:39:16 -0500
Subject: [PATCH 36/43] [Bug] [V1] Try fetching stop_reason from EngineOutput
 before checking the request (#13108)

---
 vllm/v1/engine/output_processor.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index 7973c62c381ff..1438f9d5a7b42 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -2,7 +2,7 @@
 
 import asyncio
 from dataclasses import dataclass
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union
 
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import RequestOutputKind
@@ -164,6 +164,7 @@ class OutputProcessor:
 
             new_token_ids = engine_core_output.new_token_ids
             finish_reason = engine_core_output.finish_reason
+            stop_reason = engine_core_output.stop_reason
 
             # TODO(andy): prompt logprobs + chunked prefill can
             # result in engine core returning an output for a
@@ -181,9 +182,10 @@ class OutputProcessor:
 
             # 2) Detokenize the token ids into text and check for stop
             #    strings.
-            stop_reason = req_state.detokenizer.update(new_token_ids)
-            if stop_reason:
+            stop_string = req_state.detokenizer.update(new_token_ids)
+            if stop_string and finish_reason != FinishReason.STOP:
                 finish_reason = FinishReason.STOP
+                stop_reason = stop_string
 
             # 3) Compute sample and prompt logprobs for request,
             #    if required.
@@ -250,7 +252,7 @@ class OutputProcessor:
         request_state: RequestState,
         new_token_ids: List[int],
         finish_reason: Optional[FinishReason],
-        stop_reason: Optional[str],
+        stop_reason: Union[int, str, None],
     ) -> Optional[RequestOutput]:
 
         finished = finish_reason is not None

From 985b4a2b1989b2879809d6a3b84c11ac9e1171a3 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 12 Feb 2025 19:55:23 +0800
Subject: [PATCH 37/43] [Bugfix] Fix num video tokens calculation for Qwen2-VL
 (#13148)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 vllm/model_executor/models/qwen2_vl.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index f2071eaff481f..d3294a4d4a3b6 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -800,7 +800,11 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
             preprocessed_size = ImageSize(width=image_width,
                                           height=image_height)
 
-        grid_t = max(num_frames // temporal_patch_size, 1)
+        # NOTE: Frames are padded to be divisible by `temporal_patch_size`
+        # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py#L294
+        padded_num_frames = num_frames + num_frames % temporal_patch_size
+
+        grid_t = max(padded_num_frames // temporal_patch_size, 1)
         grid_h = preprocessed_size.height // patch_size
         grid_w = preprocessed_size.width // patch_size
 

From 314cfade02b28d50349c4df1a7ea0bbdaef589f1 Mon Sep 17 00:00:00 2001
From: Rafael Vasquez <rafvasq21@gmail.com>
Date: Wed, 12 Feb 2025 11:29:56 -0500
Subject: [PATCH 38/43] [Frontend] Generate valid tool call IDs when using
 `tokenizer-mode=mistral` (#12332)

---
 tests/mistral_tool_use/__init__.py            |  0
 tests/mistral_tool_use/conftest.py            | 40 +++++++++++++++++++
 .../test_mistral_tool_calls.py                | 29 ++++++++++++++
 tests/mistral_tool_use/utils.py               | 33 +++++++++++++++
 vllm/entrypoints/openai/serving_chat.py       | 16 +++++---
 .../tool_parsers/mistral_tool_parser.py       |  2 +-
 .../transformers_utils/tokenizers/__init__.py |  7 +++-
 vllm/transformers_utils/tokenizers/mistral.py | 30 ++++++++++++++
 8 files changed, 149 insertions(+), 8 deletions(-)
 create mode 100644 tests/mistral_tool_use/__init__.py
 create mode 100644 tests/mistral_tool_use/conftest.py
 create mode 100644 tests/mistral_tool_use/test_mistral_tool_calls.py
 create mode 100644 tests/mistral_tool_use/utils.py

diff --git a/tests/mistral_tool_use/__init__.py b/tests/mistral_tool_use/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/mistral_tool_use/conftest.py b/tests/mistral_tool_use/conftest.py
new file mode 100644
index 0000000000000..39ab01c9b8741
--- /dev/null
+++ b/tests/mistral_tool_use/conftest.py
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+import pytest_asyncio
+from huggingface_hub import snapshot_download
+
+from tests.utils import RemoteOpenAIServer
+from vllm.platforms import current_platform
+
+from .utils import ARGS, CONFIGS, ServerConfig
+
+
+# for each server config, download the model and return the config
+@pytest.fixture(scope="session", params=CONFIGS.keys())
+def server_config(request):
+    config = CONFIGS[request.param]
+
+    if current_platform.is_rocm() and not config.get("supports_rocm", True):
+        pytest.skip("The {} model can't be tested on the ROCm platform".format(
+            config["model"]))
+
+    # download model and tokenizer using transformers
+    snapshot_download(config["model"])
+    yield CONFIGS[request.param]
+
+
+# run this for each server config
+@pytest.fixture(scope="session")
+def server(request, server_config: ServerConfig):
+    model = server_config["model"]
+    args_for_model = server_config["arguments"]
+    with RemoteOpenAIServer(model, ARGS + args_for_model,
+                            max_wait_seconds=480) as server:
+        yield server
+
+
+@pytest_asyncio.fixture
+async def client(server: RemoteOpenAIServer):
+    async with server.get_async_client() as async_client:
+        yield async_client
diff --git a/tests/mistral_tool_use/test_mistral_tool_calls.py b/tests/mistral_tool_use/test_mistral_tool_calls.py
new file mode 100644
index 0000000000000..bbb3a07895f6c
--- /dev/null
+++ b/tests/mistral_tool_use/test_mistral_tool_calls.py
@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import openai
+import pytest
+
+from tests.tool_use.utils import MESSAGES_ASKING_FOR_TOOLS, WEATHER_TOOL
+
+
+# test: a tool_choice with mistral-tokenizer results in an ID of length 9
+@pytest.mark.asyncio
+async def test_tool_call_with_tool_choice(client: openai.AsyncOpenAI):
+    models = await client.models.list()
+    model_name: str = models.data[0].id
+    chat_completion = await client.chat.completions.create(
+        messages=MESSAGES_ASKING_FOR_TOOLS,
+        temperature=0,
+        max_completion_tokens=100,
+        model=model_name,
+        tools=[WEATHER_TOOL],
+        tool_choice=WEATHER_TOOL,
+        logprobs=False)
+
+    choice = chat_completion.choices[0]
+
+    assert choice.finish_reason != "tool_calls"  # "stop" or "length"
+    assert choice.message.role == "assistant"
+    assert choice.message.tool_calls is None \
+           or len(choice.message.tool_calls) == 1
+    assert len(choice.message.tool_calls[0].id) == 9  # length of 9 for mistral
diff --git a/tests/mistral_tool_use/utils.py b/tests/mistral_tool_use/utils.py
new file mode 100644
index 0000000000000..971ed55ca3c02
--- /dev/null
+++ b/tests/mistral_tool_use/utils.py
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Dict, List, Optional
+
+from typing_extensions import TypedDict
+
+
+class ServerConfig(TypedDict, total=False):
+    model: str
+    arguments: List[str]
+    system_prompt: Optional[str]
+    supports_parallel: Optional[bool]
+    supports_rocm: Optional[bool]
+
+
+ARGS: List[str] = ["--max-model-len", "1024"]
+
+CONFIGS: Dict[str, ServerConfig] = {
+    "mistral": {
+        "model":
+        "mistralai/Mistral-7B-Instruct-v0.3",
+        "arguments": [
+            "--tokenizer-mode", "mistral",
+            "--ignore-patterns=\"consolidated.safetensors\""
+        ],
+        "system_prompt":
+        "You are a helpful assistant with access to tools. If a tool"
+        " that you have would be helpful to answer a user query, "
+        "call the tool. Otherwise, answer the user's query directly "
+        "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
+        "to the user's question - just respond to it normally."
+    },
+}
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 107220d548afc..934bd2a95063c 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -28,12 +28,15 @@ from vllm.entrypoints.openai.reasoning_parsers import (ReasoningParser,
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
+from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
+    MistralToolCall)
 from vllm.logger import init_logger
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.sequence import Logprob
 from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
-from vllm.transformers_utils.tokenizers import maybe_serialize_tool_calls
+from vllm.transformers_utils.tokenizers import (maybe_serialize_tool_calls,
+                                                truncate_tool_call_ids)
 
 logger = init_logger(__name__)
 
@@ -150,11 +153,12 @@ class OpenAIServingChat(OpenAIServing):
                 return self.create_error_response(
                     "tool_choice = \"required\" is not supported!")
 
-            # because of issues with pydantic we need to potentially
-            # re-serialize the tool_calls field of the request
-            # for more info: see comment in `maybe_serialize_tool_calls`
             if isinstance(tokenizer, MistralTokenizer):
+                # because of issues with pydantic we need to potentially
+                # re-serialize the tool_calls field of the request
+                # for more info: see comment in `maybe_serialize_tool_calls`
                 maybe_serialize_tool_calls(request)
+                truncate_tool_call_ids(request)
 
             if (request.tool_choice == "auto" and
                     not (self.enable_auto_tools and tool_parser is not None)
@@ -745,11 +749,13 @@ class OpenAIServingChat(OpenAIServing):
             elif request.tool_choice and type(
                     request.tool_choice) is ChatCompletionNamedToolChoiceParam:
 
+                tool_call_class = MistralToolCall if isinstance(
+                    tokenizer, MistralTokenizer) else ToolCall
                 message = ChatMessage(
                     role=role,
                     content="",
                     tool_calls=[
-                        ToolCall(function=FunctionCall(
+                        tool_call_class(function=FunctionCall(
                             name=request.tool_choice.function.name,
                             arguments=output.text))
                     ])
diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
index 51354f7c95623..4f04808829925 100644
--- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
@@ -33,7 +33,7 @@ class MistralToolCall(ToolCall):
 
     @staticmethod
     def generate_random_id():
-        # Mistral Tool Call Ids must be alphanumeric with a maximum length of 9.
+        # Mistral Tool Call Ids must be alphanumeric with a length of 9.
         # https://github.com/mistralai/mistral-common/blob/21ee9f6cee3441e9bb1e6ed2d10173f90bd9b94b/src/mistral_common/protocol/instruct/validator.py#L299
         return "".join(choices(ALPHANUMERIC, k=9))
 
diff --git a/vllm/transformers_utils/tokenizers/__init__.py b/vllm/transformers_utils/tokenizers/__init__.py
index 2b64f3fc70569..c12388d9b20bc 100644
--- a/vllm/transformers_utils/tokenizers/__init__.py
+++ b/vllm/transformers_utils/tokenizers/__init__.py
@@ -1,5 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
-from .mistral import MistralTokenizer, maybe_serialize_tool_calls
+from .mistral import (MistralTokenizer, maybe_serialize_tool_calls,
+                      truncate_tool_call_ids)
 
-__all__ = ["MistralTokenizer", "maybe_serialize_tool_calls"]
+__all__ = [
+    "MistralTokenizer", "maybe_serialize_tool_calls", "truncate_tool_call_ids"
+]
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 59131a9d7bfdd..4e76f2dc871b7 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -68,6 +68,36 @@ def maybe_serialize_tool_calls(request: "ChatCompletionRequest"):
             request.messages[i]["tool_calls"] = validated_tool_calls
 
 
+def truncate_tool_call_ids(request: "ChatCompletionRequest"):
+    """Truncates tool call IDs for Mistral's ID requirements."""
+    for i, message in enumerate(request.messages):
+        if message.get("role") == 'assistant':
+            tool_calls = message.get("tool_calls", [])
+            for tool_call in tool_calls:
+                if len(tool_call["id"]) > 9:
+                    logger.warning(
+                        "Truncating tool call ID: %s to %s",
+                        tool_call["id"],
+                        tool_call["id"][-9:],
+                    )
+                    tool_call["id"] = tool_call["id"][-9:]
+
+            request.messages[i]["tool_calls"] = tool_calls
+
+        elif message.get("role") in {"tool_results", "tool"}:
+            if "tool_call_id" in message:
+                tool_call_id = message["tool_call_id"]
+
+                if len(tool_call_id) > 9:
+                    logger.warning(
+                        "Truncating tool_call_id: %s to %s",
+                        tool_call_id,
+                        tool_call_id[-9:],
+                    )
+                    tool_call_id = tool_call_id[-9:]
+                request.messages[i]["tool_call_id"] = tool_call_id
+
+
 def list_local_repo_files(repo_id: str, revision: Optional[str]) -> List[str]:
     repo_cache = os.path.join(
         huggingface_hub.constants.HF_HUB_CACHE,

From 82cabf53a32be91ec08f214e97de06b99d0eef18 Mon Sep 17 00:00:00 2001
From: Jee Jee Li <pandaleefree@gmail.com>
Date: Thu, 13 Feb 2025 00:58:24 +0800
Subject: [PATCH 39/43] [Misc] Delete unused LoRA modules (#13151)

---
 tests/lora/test_lora_manager.py         | 18 ++++++++++++------
 vllm/lora/models.py                     |  8 +++++++-
 vllm/lora/punica_wrapper/punica_base.py |  2 +-
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index 6666f54fdebd1..9fecd11f57afe 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -606,20 +606,26 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device):
 
     assert isinstance(model.get_submodule("gate_up_proj"),
                       MergedColumnParallelLinearWithLoRA)
+    # Verify packed lora is correct
+    model_lora_clone = model_lora.clone(1)
+    model_lora_clone1 = model_lora1.clone(1)
     assert manager.add_adapter(model_lora)
     assert manager.add_adapter(model_lora1)
 
+    assert model_lora.get_lora("gate_proj") is None
+    assert model_lora.get_lora("up_proj") is None
+    assert model_lora1.get_lora("up_proj") is None
     packed_lora = model_lora.get_lora("gate_up_proj")
     assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights)
 
     torch.testing.assert_close(packed_lora.lora_a[0],
-                               model_lora.get_lora("gate_proj").lora_a)
+                               model_lora_clone.get_lora("gate_proj").lora_a)
     torch.testing.assert_close(packed_lora.lora_b[0],
-                               model_lora.get_lora("gate_proj").lora_b)
+                               model_lora_clone.get_lora("gate_proj").lora_b)
     torch.testing.assert_close(packed_lora.lora_a[1],
-                               model_lora.get_lora("up_proj").lora_a)
+                               model_lora_clone.get_lora("up_proj").lora_a)
     torch.testing.assert_close(packed_lora.lora_b[1],
-                               model_lora.get_lora("up_proj").lora_b)
+                               model_lora_clone.get_lora("up_proj").lora_b)
 
     packed_lora1 = model_lora1.get_lora("gate_up_proj")
     assert packed_lora1 and isinstance(packed_lora1, PackedLoRALayerWeights)
@@ -627,6 +633,6 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device):
     assert packed_lora1.lora_a[0] is None
     assert packed_lora1.lora_b[0] is None
     torch.testing.assert_close(packed_lora1.lora_a[1],
-                               model_lora1.get_lora("up_proj").lora_a)
+                               model_lora_clone1.get_lora("up_proj").lora_a)
     torch.testing.assert_close(packed_lora1.lora_b[1],
-                               model_lora1.get_lora("up_proj").lora_b)
+                               model_lora_clone1.get_lora("up_proj").lora_b)
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index ef77fd4b74cec..b7403980d0b0d 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -5,7 +5,8 @@ import math
 import os
 import re
 from dataclasses import dataclass, field
-from typing import Any, Callable, Dict, List, Optional, Sequence, Type, Union
+from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Type,
+                    Union)
 
 import safetensors.torch
 import torch
@@ -619,12 +620,14 @@ class LoRAModelManager(AdapterModelManager):
     def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None:
         for module_name, new_module_names in self.packed_modules.items():
             replacement_loras: List[Optional[LoRALayerWeights]] = []
+            replaced_module: Set[str] = set()
             has_replacement = False
             for r in new_module_names:
                 lora = lora_model.get_lora(r)
                 replacement_loras.append(lora)
                 if lora:
                     has_replacement = True
+                    replaced_module.add(r)
             if not has_replacement:
                 continue
             for i in range(len(replacement_loras)):
@@ -633,6 +636,9 @@ class LoRAModelManager(AdapterModelManager):
                 replacement_loras[i] = None
             lora_model.loras[module_name] = PackedLoRALayerWeights.pack(
                 replacement_loras)
+            # Remove the modules that have been replaced.
+            for module in replaced_module:
+                lora_model.loras.pop(module, None)
 
     def deactivate_adapter(self, adapter_id: int) -> bool:
         return deactivate_adapter(adapter_id, self._active_adapters,
diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py
index 1a2282ae9accd..dad98f8e2122e 100644
--- a/vllm/lora/punica_wrapper/punica_base.py
+++ b/vllm/lora/punica_wrapper/punica_base.py
@@ -147,7 +147,7 @@ class PunicaWrapperBase(PunicaWrapperABC):
                                               dtype=torch.long,
                                               device=device)
 
-        # 5 is the number of indicies tensors.
+        # 5 is the number of indices tensors.
         # base_indices, sampler_indices, sampler_indices_padded,
         # embeddings_indices,long_lora_indices
         self.indices_len: List[Optional[int]] = [None] * 5

From 042c3419fad1a89c32a27abe8089af6de960bfce Mon Sep 17 00:00:00 2001
From: Lu Fang <30275821+houseroad@users.noreply.github.com>
Date: Wed, 12 Feb 2025 09:06:13 -0800
Subject: [PATCH 40/43] Introduce VLLM_CUDART_SO_PATH to allow users specify
 the .so path (#12998)

Signed-off-by: Lu Fang <lufang@fb.com>
---
 .../device_communicators/cuda_wrapper.py      | 32 ++++++++++++++++++-
 vllm/envs.py                                  |  6 ++++
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py
index 010caf7ebac97..bc2cfbf321875 100644
--- a/vllm/distributed/device_communicators/cuda_wrapper.py
+++ b/vllm/distributed/device_communicators/cuda_wrapper.py
@@ -5,12 +5,14 @@ convenient for use when we just need to call a few functions.
 """
 
 import ctypes
+import glob
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional
 
 # this line makes it possible to directly load `libcudart.so` using `ctypes`
 import torch  # noqa
 
+import vllm.envs as envs
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -60,6 +62,29 @@ def find_loaded_library(lib_name) -> Optional[str]:
     return path
 
 
+def get_cudart_lib_path_from_env() -> Optional[str]:
+    """
+    In some system, find_loaded_library() may not work. So we allow users to
+    specify the path through environment variable VLLM_CUDART_SO_PATH.
+    """
+    cudart_so_env = envs.VLLM_CUDART_SO_PATH
+    if cudart_so_env is not None:
+        cudart_paths = [
+            cudart_so_env,
+        ]
+        for path in cudart_paths:
+            file_paths = glob.glob(path)
+            if len(file_paths) > 0:
+                logger.info(
+                    "Found cudart library at %s through env var"
+                    "VLLM_CUDART_SO_PATH=%s",
+                    file_paths[0],
+                    cudart_so_env,
+                )
+                return file_paths[0]
+    return None
+
+
 class CudaRTLibrary:
     exported_functions = [
         # ​cudaError_t cudaSetDevice ( int  device )
@@ -105,8 +130,13 @@ class CudaRTLibrary:
     def __init__(self, so_file: Optional[str] = None):
         if so_file is None:
             so_file = find_loaded_library("libcudart")
+            if so_file is None:
+                so_file = get_cudart_lib_path_from_env()
             assert so_file is not None, \
-                "libcudart is not loaded in the current process"
+                (
+                    "libcudart is not loaded in the current process, "
+                    "try setting VLLM_CUDART_SO_PATH"
+                )
         if so_file not in CudaRTLibrary.path_to_library_cache:
             lib = ctypes.CDLL(so_file)
             CudaRTLibrary.path_to_library_cache[so_file] = lib
diff --git a/vllm/envs.py b/vllm/envs.py
index 745b068b7a458..d99c794e69e6c 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -87,6 +87,7 @@ if TYPE_CHECKING:
     VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
     VLLM_RAY_PER_WORKER_GPUS: float = 1.0
     VLLM_RAY_BUNDLE_INDICES: str = ""
+    VLLM_CUDART_SO_PATH: Optional[str] = None
 
 
 def get_default_cache_root():
@@ -572,6 +573,11 @@ environment_variables: Dict[str, Callable[[], Any]] = {
     # models the alignment is already naturally aligned to 256 bytes.
     "VLLM_CUDA_MEM_ALIGN_KV_CACHE":
     lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))),
+
+    # In some system, find_loaded_library() may not work. So we allow users to
+    # specify the path through environment variable VLLM_CUDART_SO_PATH.
+    "VLLM_CUDART_SO_PATH":
+    lambda: os.getenv("VLLM_CUDART_SO_PATH", None),
 }
 
 # end-env-vars-definition

From 2c2b560f4829b9dfc91308628c5d6f6928247a0e Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Wed, 12 Feb 2025 12:12:22 -0500
Subject: [PATCH 41/43] [CI/Build] Use mypy matcher for pre-commit CI job
 (#13162)

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 .github/workflows/pre-commit.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index dc10b9116bbd8..6ab63a4027704 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -14,6 +14,7 @@ jobs:
       with:
         python-version: "3.12"
     - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
+    - run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
     - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
       with:
         extra_args: --all-files --hook-stage manual

From 36a08630e80b6176489eef45e15b07724d95a944 Mon Sep 17 00:00:00 2001
From: Qubitium-ModelCloud <qubitium@modelcloud.ai>
Date: Thu, 13 Feb 2025 01:19:43 +0800
Subject: [PATCH 42/43] [CORE] [QUANT] Support for GPTQModel's `dynamic`
 quantization per module override/control (#7086)

---
 tests/quantization/test_gptq_dynamic.py       | 68 ++++++++++++++
 tests/quantization/test_lm_head.py            | 25 +++--
 vllm/lora/layers.py                           |  2 +-
 .../model_executor/layers/logits_processor.py |  6 +-
 .../layers/quantization/gptq.py               | 47 ++++++++--
 .../layers/quantization/gptq_marlin.py        | 59 +++++++++---
 .../layers/quantization/utils/gptq_utils.py   | 94 +++++++++++++++++++
 .../layers/vocab_parallel_embedding.py        | 36 +++----
 8 files changed, 281 insertions(+), 56 deletions(-)
 create mode 100644 tests/quantization/test_gptq_dynamic.py
 create mode 100644 vllm/model_executor/layers/quantization/utils/gptq_utils.py

diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py
new file mode 100644
index 0000000000000..c6f34fef2743b
--- /dev/null
+++ b/tests/quantization/test_gptq_dynamic.py
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Tests whether gptq models with dynamic quantized can be loaded.
+
+Run `pytest tests/quantization/test_gptq_dynamic.py --forked`.
+"""
+
+import pytest
+import torch
+
+from vllm.model_executor.layers.linear import UnquantizedLinearMethod
+from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
+from vllm.model_executor.layers.quantization.gptq_marlin import (
+    GPTQMarlinLinearMethod)
+from vllm.model_executor.layers.quantization.utils.gptq_utils import (
+    get_dynamic_override)
+
+PROMPT = "On the surface of Mars, we found"
+
+# The first layer is quantized using bits=4, group_size=128
+# The second layer is quantized using bits=8, group_size=32
+# All other layers (layer index >= 2) are not quantized
+MODEL_QUANT = [
+    ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
+     True),
+    ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
+     False),
+]
+
+
+@pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT)
+def test_gptq_with_dynamic(vllm_runner, model_id: str,
+                           use_marlin_kernel: bool):
+
+    vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048)
+
+    linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else (
+        GPTQLinearMethod)
+
+    for name, submodule in (vllm_model.model.llm_engine.model_executor.
+                            driver_worker.model_runner.model.named_modules()):
+        if name == "lm_head":
+            assert isinstance(submodule.quant_method, linear_method_cls)
+        elif name == 'model.layers.0.self_attn.qkv_proj':
+            # The first layer is quantized using bits=4, group_size=128
+            # desc_act=True
+            assert isinstance(submodule.quant_method, linear_method_cls)
+            config = submodule.quant_method.quant_config
+            assert config.weight_bits == 4
+            assert config.group_size == 128
+            assert config.desc_act
+        elif name == 'model.layers.1.self_attn.qkv_proj':
+            # The second layer is quantized using bits=8, group_size=32
+            # desc_act=False
+            assert isinstance(submodule.quant_method, linear_method_cls)
+            config = submodule.quant_method.quant_config
+            assert get_dynamic_override(config, layer_name=name,
+                                        key="bits") == 8
+            assert get_dynamic_override(config,
+                                        layer_name=name,
+                                        key="group_size") == 32
+            assert not get_dynamic_override(
+                config, layer_name=name, key="desc_act")
+        elif (name == 'model.layers.2.self_attn.qkv_proj'
+              or name == 'model.layers.2.mlp.gate_up_proj'):
+            # All other layers (layer index >= 2) are not quantized
+            assert isinstance(submodule.quant_method, UnquantizedLinearMethod)
+
+    del vllm_model
diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py
index ec60d8a57559d..20435a287e37a 100644
--- a/tests/quantization/test_lm_head.py
+++ b/tests/quantization/test_lm_head.py
@@ -3,7 +3,6 @@
 
 Run `pytest tests/quantization/test_quant_lm_head_true.py --forked`.
 """
-from typing import Tuple
 
 import pytest
 import torch
@@ -17,31 +16,31 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
 
 PROMPT = "On the surface of Mars, we found"
 
-MODELS_QUANT = [(
-    "LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse",
-    True), ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False),
-                ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False)]
+MODELS_QUANT = [
+    ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head", True),
+    ("ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024", False),
+    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False),
+    ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False)
+]
 
 
-@pytest.mark.parametrize("model_lm_head_quant", MODELS_QUANT)
+@pytest.mark.parametrize("model_id, lm_head_quantized", MODELS_QUANT)
 def test_lm_head(
     vllm_runner,
-    model_lm_head_quant: Tuple[str, bool],
+    model_id: str,
+    lm_head_quantized: bool,
 ) -> None:
-    model, lm_head_quantized = model_lm_head_quant
-
-    with vllm_runner(model, dtype=torch.float16,
+    with vllm_runner(model_id, dtype=torch.float16,
                      max_model_len=2048) as vllm_model:
 
         def check_model(model):
             lm_head_layer = model.lm_head
-
             if lm_head_quantized:
-                assert isinstance(lm_head_layer.linear_method,
+                assert isinstance(lm_head_layer.quant_method,
                                   (GPTQLinearMethod, GPTQMarlinLinearMethod,
                                    MarlinLinearMethod))
             else:
-                assert isinstance(lm_head_layer.linear_method,
+                assert isinstance(lm_head_layer.quant_method,
                                   UnquantizedEmbeddingMethod)
 
         vllm_model.apply_model(check_model)
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index 9826aeb9dc27e..7f68dae9717ca 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -1039,7 +1039,7 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
         embedding_bias: Optional[torch.Tensor] = None,
     ) -> Optional[torch.Tensor]:
         # Get the logits for the next tokens.
-        logits = lm_head.linear_method.apply(lm_head, hidden_states)
+        logits = lm_head.quant_method.apply(lm_head, hidden_states)
         if embedding_bias is not None:
             logits += embedding_bias
 
diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py
index 0565c6e8be381..9b17429985784 100644
--- a/vllm/model_executor/layers/logits_processor.py
+++ b/vllm/model_executor/layers/logits_processor.py
@@ -108,9 +108,9 @@ class LogitsProcessor(nn.Module):
         embedding_bias: Optional[torch.Tensor],
     ) -> Optional[torch.Tensor]:
         # Get the logits for the next tokens.
-        logits = lm_head.linear_method.apply(lm_head,
-                                             hidden_states,
-                                             bias=embedding_bias)
+        logits = lm_head.quant_method.apply(lm_head,
+                                            hidden_states,
+                                            bias=embedding_bias)
 
         # Gather logits for TP
         logits = self._gather_logits(logits)
diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py
index 0cb77a7546d1a..6d1f0cc2eb4d5 100644
--- a/vllm/model_executor/layers/quantization/gptq.py
+++ b/vllm/model_executor/layers/quantization/gptq.py
@@ -3,16 +3,17 @@
 import enum
 from enum import Enum
 from fractions import Fraction
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 import torch
 from torch.nn.parameter import Parameter
 
 from vllm import _custom_ops as ops
-from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
+from vllm.model_executor.layers.linear import LinearMethodBase
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
-from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+from vllm.model_executor.layers.quantization.utils.gptq_utils import (
+    get_linear_quant_method)
 from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
                                            GroupQuantScaleParameter,
                                            PackedColumnParameter,
@@ -32,7 +33,33 @@ class GPTQConfig(QuantizationConfig):
         group_size: int,
         desc_act: bool,
         lm_head_quantized: bool,
+        dynamic: Dict[str, Dict[str, Union[int, bool]]],
     ) -> None:
+        # GPTQModel use `dynamic` config property to allow per module
+        # quantization config so each module can be individually optimized.
+        # Format is Dict[str, Dict] where key is a regex string that can
+        # perform both positive ("+:" prefixed) or negative ("-:" prefixed)
+        # matching of a module.
+        # Default to positive match, override base quant config mode, if no
+        # prefix is used. Value is in dict format of field key and override
+        # value.
+        # Negative matching will skip quantization init for this module
+        # entirely:
+        # non-quantized inference. More details and quantization examples can be
+        # found at: https://github.com/ModelCloud/GPTQModel
+        # Example:
+        #  # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9
+        #  # last 1/4 of the layers 16-21 has 8bit and group_size 64
+        # dynamic = {
+        #  #`.*\.` matches the layers_node prefix
+        #  # positive match layer 10-15
+        #  r"+:.*\.(?:1[0-5])\..*": {"bits": 8,},
+        #  # positive match layer 16-21
+        #  r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,},
+        #  r"-:.*\.moe\..*": {}, # negative match (skip) all `moe` layers
+        # }
+        self.dynamic = dynamic
+
         self.weight_bits = weight_bits
         self.group_size = group_size
         self.desc_act = desc_act
@@ -47,7 +74,8 @@ class GPTQConfig(QuantizationConfig):
         return (f"GPTQConfig(weight_bits={self.weight_bits}, "
                 f"group_size={self.group_size}, "
                 f"desc_act={self.desc_act}),"
-                f"lm_head_quantized={self.lm_head_quantized}")
+                f"lm_head_quantized={self.lm_head_quantized}), "
+                f"dynamic={self.dynamic}")
 
     @classmethod
     def get_name(cls) -> str:
@@ -68,19 +96,20 @@ class GPTQConfig(QuantizationConfig):
 
     @classmethod
     def from_config(cls, config: Dict[str, Any]) -> "GPTQConfig":
+        dynamic = cls.get_from_keys_or(config, ["dynamic"], default={})
+        dynamic = {} if dynamic is None else dynamic
+
         weight_bits = cls.get_from_keys(config, ["bits"])
         group_size = cls.get_from_keys(config, ["group_size"])
         desc_act = cls.get_from_keys(config, ["desc_act"])
         lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"],
                                                  default=False)
-        return cls(weight_bits, group_size, desc_act, lm_head_quantized)
+        return cls(weight_bits, group_size, desc_act, lm_head_quantized,
+                   dynamic)
 
     def get_quant_method(self, layer: torch.nn.Module,
                          prefix: str) -> Optional["GPTQLinearMethod"]:
-        if (isinstance(layer, LinearBase) or
-            (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)):
-            return GPTQLinearMethod(self)
-        return None
+        return get_linear_quant_method(self, layer, prefix, GPTQLinearMethod)
 
 
 class ExllamaState(Enum):
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 84c53b2c16d57..0a9d86b008db6 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -9,17 +9,21 @@ from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
-from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
+from vllm.model_executor.layers.linear import (LinearMethodBase,
+                                               UnquantizedLinearMethod,
                                                set_weight_attrs)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.kernels.mixed_precision import (
     MPLinearLayerConfig, choose_mp_linear_kernel)
 from vllm.model_executor.layers.quantization.utils import replace_parameter
+from vllm.model_executor.layers.quantization.utils.gptq_utils import (
+    get_linear_quant_method)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     check_marlin_supported, marlin_moe_permute_scales,
     marlin_repeat_scales_on_all_ranks, verify_marlin_supported)
-from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    UnquantizedEmbeddingMethod)
 from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
                                            GroupQuantScaleParameter,
                                            PackedColumnParameter,
@@ -47,12 +51,41 @@ class GPTQMarlinConfig(QuantizationConfig):
         desc_act: bool,
         is_sym: bool,
         lm_head_quantized: bool,
+        dynamic: Dict[str, Dict[str, Union[int, bool]]],
     ) -> None:
         if desc_act and group_size == -1:
             # In this case, act_order == True is the same as act_order == False
             # (since we have only one group per output channel)
             desc_act = False
 
+        # GPTQModel use `dynamic` config property to allow per module
+        # quantization config so each module can be individually optimized.
+        # Format is Dict[str, Dict] where key is a regex string that can
+        # perform both positive ("+:" prefixed) or negative ("-:" prefixed)
+        # matching of a module.
+        # Default to positive match, override base quant config mode, if no
+        # prefix is used. Value is in dict format of field key and override
+        # value.
+        # Negative matching will skip quantization init for this module
+        # entirely:
+        # non-quantized inference. More details and quantization examples can be
+        # found at: https://github.com/ModelCloud/GPTQModel
+        # Example:
+        #  # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9
+        #  # last 1/4 of the layers 16-21 has 8bit and group_size 64
+        # dynamic = {
+        #  #`.*\.` matches the layers_node prefix
+        #  # positive match layer 10-15
+        #  r"+:.*\.(?:1[0-5])\..*": {"bits": 8,},
+        #  # positive match layer 16-21
+        #  r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,},
+        #  r"-:.*\.moe\..*": {}, # negative match (skip) all `moe` layers
+        # }
+        self.dynamic = dynamic
+
+        self.weight_bits = weight_bits
+        self.is_sym = is_sym
+
         self.pack_factor = 32 // weight_bits  # packed into int32
         self.group_size = group_size
         self.desc_act = desc_act
@@ -68,7 +101,8 @@ class GPTQMarlinConfig(QuantizationConfig):
         return (f"GPTQMarlinConfig(quant_type={self.quant_type}, "
                 f"group_size={self.group_size}, "
                 f"desc_act={self.desc_act}, "
-                f"lm_head_quantized={self.lm_head_quantized})")
+                f"lm_head_quantized={self.lm_head_quantized}), "
+                f"dynamic={self.dynamic}")
 
     @classmethod
     def get_name(cls) -> str:
@@ -88,6 +122,9 @@ class GPTQMarlinConfig(QuantizationConfig):
 
     @classmethod
     def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig":
+        dynamic = cls.get_from_keys_or(config, ["dynamic"], default={})
+        dynamic = {} if dynamic is None else dynamic
+
         weight_bits = cls.get_from_keys(config, ["bits"])
         group_size = cls.get_from_keys(config, ["group_size"])
         desc_act = cls.get_from_keys(config, ["desc_act"])
@@ -95,7 +132,7 @@ class GPTQMarlinConfig(QuantizationConfig):
         lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"],
                                                  default=False)
         return cls(weight_bits, group_size, desc_act, is_sym,
-                   lm_head_quantized)
+                   lm_head_quantized, dynamic)
 
     @classmethod
     def override_quantization_method(cls, hf_quant_cfg,
@@ -120,17 +157,15 @@ class GPTQMarlinConfig(QuantizationConfig):
 
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod"]]:
-        if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead)
-                                             and self.lm_head_quantized):
-            return GPTQMarlinLinearMethod(self)
-        elif isinstance(layer, FusedMoE):
+    ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod",
+                        UnquantizedLinearMethod, UnquantizedEmbeddingMethod]]:
+        if isinstance(layer, FusedMoE):
             return GPTQMarlinMoEMethod(self)
-        return None
+        return get_linear_quant_method(self, layer, prefix,
+                                       GPTQMarlinLinearMethod)
 
     @classmethod
     def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]):
-        # Extract data from quant config.
         quant_method = quant_config.get("quant_method", "").lower()
         num_bits = quant_config.get("bits")
         group_size = quant_config.get("group_size")
@@ -143,7 +178,7 @@ class GPTQMarlinConfig(QuantizationConfig):
         if quant_method != "gptq":
             return False
 
-        # If we cannot find the info needed in the config, cannot convert.
+        # Marlin conversion is only valid if required properties are found
         if (num_bits is None or group_size is None or sym is None
                 or desc_act is None):
             return False
diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py
new file mode 100644
index 0000000000000..5b0e6299f4739
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py
@@ -0,0 +1,94 @@
+# SPDX-License-Identifier: Apache-2.0
+import re
+from copy import deepcopy
+from typing import Dict, Optional, Union
+
+import torch
+
+from vllm.config import QuantizationConfig
+from vllm.model_executor.layers.linear import (LinearBase,
+                                               UnquantizedLinearMethod)
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead, UnquantizedEmbeddingMethod)
+
+
+# Match dynamic rules with module name (prefix) and override quantize
+# config if module (prefix) matches a rule
+def override_config(config: QuantizationConfig, prefix: str):
+    weight_bits = get_dynamic_override(config, prefix, "bits",
+                                       config.weight_bits)
+    if isinstance(weight_bits, int):
+        config.weight_bits = weight_bits
+    group_size = get_dynamic_override(config, prefix, "group_size",
+                                      config.group_size)
+    if isinstance(group_size, int):
+        config.group_size = group_size
+    desc_act = get_dynamic_override(config, prefix, "desc_act",
+                                    config.desc_act)
+    if isinstance(desc_act, bool):
+        config.desc_act = desc_act
+
+    config.pack_factor = 32 // config.weight_bits  # packed into int32
+    if config.get_name() == "gptq_marlin":
+        is_sym = get_dynamic_override(config, prefix, "sym", config.is_sym)
+        if isinstance(is_sym, bool):
+            config.is_sym = is_sym
+
+        if (config.weight_bits, config.is_sym) not in config.TYPE_MAP:
+            raise ValueError("Unsupported quantization config: "
+                             f"bits={config.weight_bits}, sym={config.is_sym}")
+
+        config.quant_type = config.TYPE_MAP[(config.weight_bits,
+                                             config.is_sym)]
+    elif config.get_name() == "gptq":
+        if config.weight_bits not in [2, 3, 4, 8]:
+            raise ValueError(
+                "Currently, only 2/3/4/8-bit weight quantization is "
+                f"supported for GPTQ, but got {config.weight_bits} bits.")
+
+
+def get_dynamic_override(
+    config: QuantizationConfig,
+    layer_name: str,
+    key: Optional[str] = None,
+    default_value: Union[int, bool,
+                         None] = None) -> Union[Dict, int, bool, None]:
+    for pattern, pattern_dict in config.dynamic.items():
+        # Negative match: matched modules are excluded from quantized init
+        if pattern.startswith("-:"):
+            if re.match(pattern.removeprefix("-:"), layer_name):
+                return False
+        # Positive match: matched modules have quant properties overrides
+        # base quant config
+        elif re.match(pattern.removeprefix("+:"), layer_name):
+            if key is None:
+                return pattern_dict
+            else:
+                return pattern_dict.get(key, default_value)
+    return default_value
+
+
+def get_linear_quant_method(
+    config: QuantizationConfig,
+    layer: torch.nn.Module,
+    prefix: str,
+    linear_method_cls: type,
+):
+    cloned_config = deepcopy(config)
+    parallel_lm_head_quantized = isinstance(
+        layer, ParallelLMHead) and cloned_config.lm_head_quantized
+    if isinstance(layer, LinearBase) or parallel_lm_head_quantized:
+        # False = skip module, None = no override, else = Positive match
+        if get_dynamic_override(  # noqa: E712
+                cloned_config,  # noqa: E712
+                layer_name=prefix) == False:  # noqa: E712
+            if parallel_lm_head_quantized:
+                return UnquantizedEmbeddingMethod()
+            return UnquantizedLinearMethod()
+
+        if prefix:
+            # Dynamic per module/layer rules may override base config
+            override_config(cloned_config, prefix=prefix)
+
+        return linear_method_cls(cloned_config)
+    return None
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index e409094dd535d..f65dfc3cb3294 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -226,24 +226,24 @@ class VocabParallelEmbedding(torch.nn.Module):
                                                self.tp_size)
         self.embedding_dim = embedding_dim
 
-        linear_method = None
+        quant_method = None
         if quant_config is not None:
-            linear_method = quant_config.get_quant_method(self, prefix=prefix)
-        if linear_method is None:
-            linear_method = UnquantizedEmbeddingMethod()
+            quant_method = quant_config.get_quant_method(self, prefix=prefix)
+        if quant_method is None:
+            quant_method = UnquantizedEmbeddingMethod()
 
         # If we are making an embedding layer, then our quantization linear
         # method must implement the embedding operation. If we are another
         # layer type like ParallelLMHead, this is not important.
         is_embedding_layer = type(self.__class__) is VocabParallelEmbedding
-        linear_method_implements_embedding = method_has_implemented_embedding(
-            type(linear_method))
-        if is_embedding_layer and not linear_method_implements_embedding:
+        quant_method_implements_embedding = method_has_implemented_embedding(
+            type(quant_method))
+        if is_embedding_layer and not quant_method_implements_embedding:
             raise NotImplementedError(
-                f"The class {type(linear_method).__name__} must implement "
+                f"The class {type(quant_method).__name__} must implement "
                 "the 'embedding' method, see UnquantizedEmbeddingMethod.")
 
-        self.linear_method: QuantizeMethodBase = linear_method
+        self.quant_method: QuantizeMethodBase = quant_method
 
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
@@ -260,13 +260,13 @@ class VocabParallelEmbedding(torch.nn.Module):
             self.shard_indices.added_vocab_end_index -
             self.shard_indices.added_vocab_start_index)
 
-        self.linear_method.create_weights(self,
-                                          self.embedding_dim,
-                                          [self.num_embeddings_per_partition],
-                                          self.embedding_dim,
-                                          self.num_embeddings_padded,
-                                          params_dtype=params_dtype,
-                                          weight_loader=self.weight_loader)
+        self.quant_method.create_weights(self,
+                                         self.embedding_dim,
+                                         [self.num_embeddings_per_partition],
+                                         self.embedding_dim,
+                                         self.num_embeddings_padded,
+                                         params_dtype=params_dtype,
+                                         weight_loader=self.weight_loader)
 
     @classmethod
     def _get_indices(cls, vocab_size_padded: int, org_vocab_size_padded: int,
@@ -412,8 +412,8 @@ class VocabParallelEmbedding(torch.nn.Module):
         else:
             masked_input = input_
         # Get the embeddings.
-        output_parallel = self.linear_method.embedding(self,
-                                                       masked_input.long())
+        output_parallel = self.quant_method.embedding(self,
+                                                      masked_input.long())
         # Mask the output embedding.
         if self.tp_size > 1:
             output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0)

From 09972e716c4a90bfd4385540c9f478e18b4efb2d Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Wed, 12 Feb 2025 12:19:53 -0500
Subject: [PATCH 43/43] [Bugfix] Allow fallback to AWQ from AWQMarlin at
 per-layer granularity (#13119)

---
 vllm/model_executor/layers/linear.py          | 41 ++++++++++---------
 .../layers/quantization/awq_marlin.py         | 28 ++++++++-----
 .../layers/quantization/moe_wna16.py          |  9 ++--
 .../layers/quantization/utils/marlin_utils.py | 15 +++++++
 4 files changed, 61 insertions(+), 32 deletions(-)

diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index dad16112082cb..521724765bebf 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -290,29 +290,30 @@ class ColumnParallelLinear(LinearBase):
                  quant_config: Optional[QuantizationConfig] = None,
                  output_sizes: Optional[list[int]] = None,
                  prefix: str = ""):
+        # Divide the weight matrix along the last dimension.
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.input_size_per_partition = input_size
+        self.output_size_per_partition = divide(output_size, self.tp_size)
+        self.output_partition_sizes = [self.output_size_per_partition]
+        # If QKV or MergedColumn, use output size of each partition.
+        if hasattr(self, "output_sizes"):
+            self.output_partition_sizes = [
+                divide(output_size, self.tp_size)
+                for output_size in self.output_sizes
+            ]
+
         super().__init__(input_size, output_size, skip_bias_add, params_dtype,
                          quant_config, prefix)
 
         self.gather_output = gather_output
 
-        # Divide the weight matrix along the last dimension.
-        tp_size = get_tensor_model_parallel_world_size()
-        assert self.quant_method is not None
-        self.output_size_per_partition = divide(self.output_size, tp_size)
-        self.output_partition_sizes = [self.output_size_per_partition]
-        # If QKV or MergedColumn, use output size of each partition.
-        if hasattr(self, "output_sizes"):
-            self.output_partition_sizes = [
-                divide(output_size, tp_size)
-                for output_size in self.output_sizes
-            ]
-
         if output_sizes is None:
             output_sizes = [output_size]
 
+        assert self.quant_method is not None
         self.quant_method.create_weights(
             layer=self,
-            input_size_per_partition=self.input_size,
+            input_size_per_partition=self.input_size_per_partition,
             output_partition_sizes=self.output_partition_sizes,
             input_size=self.input_size,
             output_size=self.output_size,
@@ -1044,22 +1045,24 @@ class RowParallelLinear(LinearBase):
                  reduce_results: bool = True,
                  quant_config: Optional[QuantizationConfig] = None,
                  prefix: str = ""):
+        # Divide the weight matrix along the first dimension.
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.input_size_per_partition = divide(input_size, self.tp_size)
+        self.output_size_per_partition = output_size
+        self.output_partition_sizes = [output_size]
+
         super().__init__(input_size, output_size, skip_bias_add, params_dtype,
                          quant_config, prefix)
 
         self.input_is_parallel = input_is_parallel
         self.reduce_results = reduce_results
 
-        # Divide the weight matrix along the last dimension.
-        self.tp_rank = get_tensor_model_parallel_rank()
-        self.tp_size = get_tensor_model_parallel_world_size()
-        self.input_size_per_partition = divide(input_size, self.tp_size)
         assert self.quant_method is not None
-
         self.quant_method.create_weights(
             layer=self,
             input_size_per_partition=self.input_size_per_partition,
-            output_partition_sizes=[self.output_size],
+            output_partition_sizes=self.output_partition_sizes,
             input_size=self.input_size,
             output_size=self.output_size,
             params_dtype=self.params_dtype,
diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py
index 8849ba2928228..a43b2e597c1e6 100644
--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -13,15 +13,17 @@ from vllm.model_executor.layers.fused_moe.layer import (
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                                UnquantizedLinearMethod,
                                                set_weight_attrs)
-from vllm.model_executor.layers.quantization.awq import is_layer_skipped_awq
+from vllm.model_executor.layers.quantization.awq import (AWQConfig,
+                                                         is_layer_skipped_awq)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.layers.quantization.utils import replace_parameter
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
     apply_awq_marlin_linear, awq_to_marlin_zero_points, check_marlin_supported,
-    marlin_make_empty_g_idx, marlin_make_workspace, marlin_moe_permute_scales,
-    marlin_permute_scales, moe_awq_to_marlin_zero_points,
-    verify_marlin_supported, verify_marlin_supports_shape)
+    check_marlin_supports_layer, marlin_make_empty_g_idx,
+    marlin_make_workspace, marlin_moe_permute_scales, marlin_permute_scales,
+    moe_awq_to_marlin_zero_points, verify_marlin_supported,
+    verify_marlin_supports_shape)
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.parameter import (GroupQuantScaleParameter,
                                            PackedvLLMParameter)
@@ -40,18 +42,17 @@ class AWQMarlinConfig(QuantizationConfig):
         8: scalar_types.uint8,
     }
 
-    def __init__(self,
-                 weight_bits: int,
-                 group_size: int,
-                 zero_point: bool,
+    def __init__(self, weight_bits: int, group_size: int, zero_point: bool,
                  lm_head_quantized: bool,
-                 modules_to_not_convert: Optional[List[str]] = None) -> None:
+                 modules_to_not_convert: Optional[List[str]],
+                 full_config: Dict[str, Any]) -> None:
         self.pack_factor = 32 // weight_bits  # packed into int32
         self.group_size = group_size
         self.zero_point = zero_point
         self.lm_head_quantized = lm_head_quantized
         self.weight_bits = weight_bits
         self.modules_to_not_convert = modules_to_not_convert or []
+        self.full_config = full_config
 
         if self.weight_bits not in self.TYPE_MAP:
             raise ValueError(f"Unsupported num_bits = {self.weight_bits}. "
@@ -96,7 +97,7 @@ class AWQMarlinConfig(QuantizationConfig):
         modules_to_not_convert = cls.get_from_keys_or(
             config, ["modules_to_not_convert"], None)
         return cls(weight_bits, group_size, zero_point, lm_head_quantized,
-                   modules_to_not_convert)
+                   modules_to_not_convert, config)
 
     @classmethod
     def override_quantization_method(cls, hf_quant_cfg,
@@ -124,6 +125,13 @@ class AWQMarlinConfig(QuantizationConfig):
             (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)):
             if is_layer_skipped_awq(prefix, self.modules_to_not_convert):
                 return UnquantizedLinearMethod()
+            # Check if the layer is supported by AWQMarlin.
+            if not check_marlin_supports_layer(layer, self.group_size):
+                logger.warning_once(
+                    f"Layer '{prefix}' is not supported by AWQMarlin. "
+                    "Falling back to unoptimized AWQ kernels.")
+                return AWQConfig.from_config(
+                    self.full_config).get_quant_method(layer, prefix)
             return AWQMarlinLinearMethod(self)
         elif isinstance(layer, FusedMoE):
             return AWQMoEMethod(self)
diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py
index 56fa597e20131..b9460e7d7985b 100644
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -16,6 +16,8 @@ from vllm.model_executor.layers.quantization.base_config import (
 from vllm.model_executor.layers.quantization.gptq import GPTQConfig
 from vllm.model_executor.layers.quantization.gptq_marlin import (
     GPTQMarlinConfig)
+from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+    check_marlin_supports_layer)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 
@@ -87,8 +89,8 @@ class MoeWNA16Config(QuantizationConfig):
             modules_to_not_convert = []
         elif linear_quant_method == "awq":
             has_zp = cls.get_from_keys(config, ["zero_point"])
-            modules_to_not_convert = cls.get_from_keys(
-                config, ["modules_to_not_convert"])
+            modules_to_not_convert = cls.get_from_keys_or(
+                config, ["modules_to_not_convert"], None)
         else:
             raise ValueError("moe_wna16 only support gptq and awq.")
 
@@ -135,7 +137,8 @@ class MoeWNA16Config(QuantizationConfig):
                     return GPTQConfig.from_config(
                         self.full_config).get_quant_method(layer, prefix)
             elif self.linear_quant_method == "awq":
-                if self.use_marlin:
+                if self.use_marlin and check_marlin_supports_layer(
+                        layer, self.group_size):
                     return AWQMarlinConfig.from_config(
                         self.full_config).get_quant_method(layer, prefix)
                 else:
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index 3beba30832441..05e37251aa161 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -6,6 +6,7 @@ import numpy
 import torch
 
 from vllm import _custom_ops as ops
+from vllm.model_executor.layers.linear import LinearBase
 from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types
 
@@ -135,6 +136,20 @@ def check_marlin_supports_shape(output_size_per_partition: int,
     return True, None
 
 
+def check_marlin_supports_layer(layer: LinearBase, group_size: int) \
+                                    -> bool:
+    output_size_per_partition = getattr(layer, "output_size_per_partition",
+                                        None) or layer.output_size
+    input_size_per_partition = getattr(layer, "input_size_per_partition",
+                                       None) or layer.input_size
+
+    return check_marlin_supports_shape(
+        output_size_per_partition=output_size_per_partition,
+        input_size_per_partition=input_size_per_partition,
+        input_size=layer.input_size,
+        group_size=group_size)[0]
+
+
 def marlin_make_workspace(output_size_per_partition: int,
                           device: torch.device) -> torch.Tensor:
     max_workspace_size = (output_size_per_partition //