From 6db94571d74e3dbc6e38bec7b4bd2913eea36cc9 Mon Sep 17 00:00:00 2001
From: Jee Jee Li
Date: Tue, 25 Mar 2025 11:43:48 +0800
Subject: [PATCH] [Misc] Remove LoRA log (#15388)

Signed-off-by: Jee Jee Li
---
 vllm/config.py                         |  6 ------
 vllm/lora/punica_wrapper/punica_gpu.py | 14 +++++---------
 2 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 989e5b47516ea..a2e83af3ab450 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2373,12 +2373,6 @@ class LoRAConfig:
             self.lora_dtype = model_config.dtype
         elif isinstance(self.lora_dtype, str):
             self.lora_dtype = getattr(torch, self.lora_dtype)
-        if model_config.quantization and model_config.quantization not in [
-                "awq", "gptq"
-        ]:
-            # TODO support marlin
-            logger.warning("%s quantization is not tested with LoRA yet.",
-                           model_config.quantization)
 
     def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
         # Reminder: Please update docs/source/features/compatibility_matrix.md
diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py
index be9cbe244a819..bb6d2808e46a1 100644
--- a/vllm/lora/punica_wrapper/punica_gpu.py
+++ b/vllm/lora/punica_wrapper/punica_gpu.py
@@ -78,10 +78,6 @@ class PunicaWrapperGPU(PunicaWrapperBase):
                    ...], scale: float, **kwargs):
         """
         Performs GEMM for multiple slices of lora_a.
-        When `is_prefill is` true, it indicates that it is currently the
-        prefill stage, and the `_shrink_prefill` function should be called.
-        Otherwise, it is the decode stage, and the _shrink_decode function
-        should be called.
 
         Semantics:
         for i in range(len(lora_a_stacked)):
@@ -129,7 +125,7 @@ class PunicaWrapperGPU(PunicaWrapperBase):
             lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]):
                 bias's weight
             output_slices (Tuple[int, ...]): Every slice's size
-            add_inputs (bool):  Defaults to True.
+            add_inputs (bool): Defaults to True.
         """
         y_org = y
         y = y.view(-1, y.shape[-1])
@@ -226,7 +222,7 @@ class PunicaWrapperGPU(PunicaWrapperBase):
 
         if buffer is None:
             r = lora_b_stacked[0].size(-1)
-            # We set the buffer to be float32 by default ,refer to:
+            # We set the buffer to be float32 by default, refer to:
             # https://github.com/triton-lang/triton/issues/1387
             buffer = torch.zeros(  # type: ignore
                 (len(output_slices), x.size(0), r),
@@ -268,16 +264,16 @@ class PunicaWrapperGPU(PunicaWrapperBase):
             y (torch.Tensor): Output tensor.
             x (torch.Tensor): Input tensor.
             lora_a_stacked (torch.Tensor): lora_a's weights.
-            lora_b_stacked (torch.Tensor):lora_b's weights.
+            lora_b_stacked (torch.Tensor): lora_b's weights.
             scale (float): Scaling factor.
-            buffer (Optional[torch.Tensor]):Default to None.
+            buffer (Optional[torch.Tensor]): Default to None.
         """
         y_org = y
         y = y.view(-1, y.shape[-1])
         x = x.view(-1, x.shape[-1])
         r = lora_b_stacked.size(-1)
         if buffer is None:
-            # We set the buffer to be float32 by default ,refer to:
+            # We set the buffer to be float32 by default, refer to:
             # https://github.com/triton-lang/triton/issues/1387
             buffer = torch.zeros((x.size(0), r),
                                  dtype=torch.float32,
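
For reference, the `add_shrink` docstring retained above defines the operation as `y[i] += (x @ lora_a_stacked[i]) * scale` over each LoRA-A slice. Below is a minimal pure-PyTorch sketch of that semantics; the function name and the simplified 2-D weight shapes are illustrative assumptions, and the real wrapper dispatches to a fused Triton kernel rather than a Python loop.

import torch

def add_shrink_reference(y: list[torch.Tensor], x: torch.Tensor,
                         lora_a_stacked: list[torch.Tensor],
                         scale: float) -> None:
    # Docstring semantics: y[i] += (x @ lora_a_stacked[i]) * scale.
    # Accumulate through float32, mirroring the float32 buffer the wrapper
    # allocates to work around triton-lang/triton#1387.
    for i, lora_a in enumerate(lora_a_stacked):
        # x: (num_tokens, hidden_size), lora_a: (hidden_size, rank)
        y[i] += (x.float() @ lora_a.float()).to(y[i].dtype) * scale

# Example: two LoRA-A slices of rank 8 over a hidden size of 64.
x = torch.randn(4, 64, dtype=torch.float16)
lora_a_stacked = [torch.randn(64, 8, dtype=torch.float16) for _ in range(2)]
y = [torch.zeros(4, 8, dtype=torch.float16) for _ in range(2)]
add_shrink_reference(y, x, lora_a_stacked, scale=0.5)

The float32 accumulation mirrors the buffer dtype the patch's comments point to: Triton's matmul can lose precision when accumulating in half precision, hence the float32 default referenced via triton-lang/triton#1387.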