From 3d833aa75956ef5e8969a9361c97a06c8eb8653f Mon Sep 17 00:00:00 2001
From: Sage Moore <sage@neuralmagic.com>
Date: Wed, 2 Jul 2025 21:20:21 +0000
Subject: [PATCH] Remove debug leftovers; re-enable torch.compile on grouped_topk

Signed-off-by: Sage Moore <sage@neuralmagic.com>
---
 vllm/compilation/cuda_piecewise_backend.py    |  1 -
 vllm/compilation/decorators.py                |  1 -
 vllm/forward_context.py                       |  1 -
 .../layers/fused_moe/fused_moe.py             |  2 +-
 vllm/v1/attention/backends/utils.py           | 45 -------------------
 5 files changed, 1 insertion(+), 49 deletions(-)

diff --git a/vllm/compilation/cuda_piecewise_backend.py b/vllm/compilation/cuda_piecewise_backend.py
index 18c3dfe0f171e..8c49ea6cc1074 100644
--- a/vllm/compilation/cuda_piecewise_backend.py
+++ b/vllm/compilation/cuda_piecewise_backend.py
@@ -106,7 +106,6 @@ class CUDAPiecewiseBackend:
             end_monitoring_torch_compile(self.vllm_config)
 
     def __call__(self, *args) -> Any:
-        # logger.info("CUDA BACKEND CALL")
         if not self.first_run_finished:
             self.first_run_finished = True
             self.check_for_ending_compilation()
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 5f1b268a1d6fe..05e4ca9f08b36 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -170,7 +170,6 @@ def _support_torch_compile(
         # e.g. TPU has the compilation logic in model runner, so we don't
         # need to compile the model inside.
         if self.do_not_compile or torch.compiler.is_compiling():
-            # logger.info("SKIPPING COMPILATION")
             return self.forward(*args, **kwargs)
 
         # the first compilation needs to have dynamic shapes marked
diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index 4671db4971132..ed86247dd9d28 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -91,7 +91,6 @@ class DPMetadata:
 
         # If num_tokens_across_dp is None, it will be computed by all_reduce
         # Otherwise, num_tokens_across_dp[dp_rank] should be equal to batchsize
-        # print(f"num_tokens_across_dp {num_tokens_across_dp} batchsize {batchsize}")
         assert (num_tokens_across_dp is None
                 or num_tokens_across_dp[dp_rank] == batchsize)
         if num_tokens_across_dp is None:
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 3e9ef23b6ef26..f22884b8a1a5a 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -903,7 +903,7 @@ def fused_topk(
 
 
 # This is used by the Deepseek-V2 and Deepseek-V3 model
-# @torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
+@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
 def grouped_topk(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index f12bdfc5f35d6..0bd7eaf7f8154 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -19,7 +19,6 @@ import vllm.envs as envs
 from vllm.distributed.kv_transfer.kv_connector.utils import (
     get_kv_connector_cache_layout)
 from vllm.logger import init_logger
-from vllm.v1.worker.block_table import BlockTable
 
 logger = init_logger(__name__)
 
@@ -32,8 +31,6 @@ class CommonAttentionMetadata:
     """
 
     query_start_loc: torch.Tensor
-
-    # query_start_loc_cpu: torch.Tensor
     """(batch_size + 1,), the start location of each request in query Tensor"""
     seq_lens: torch.Tensor
     """(batch_size,), the length of each request including both computed tokens
@@ -46,48 +43,6 @@ class CommonAttentionMetadata:
     max_query_len: int
     """Longest query in batch"""
 
-    # block_table: BlockTable
-
-    # def compute_request_slice(self, token_slice: slice) -> slice:
-    #     """
-    #     return 
-    #     - num_decodes: number of decode requests
-    #     - num_prefills: number of prefill requests
-    #     - num_decode_tokens: number of decode tokens
-    #     - num_prefill_tokens: number of prefill tokens
-    #     """
-    #     if self.max_query_len == 1:
-    #         # Pure decode
-    #         return token_slice
-    #     else:
-    #         # Find the first query_start_loc that's greater than the token_slice.start
-    #         first_reqest = (self.query_start_loc_cpu >= token_slice.start).int().argmax(dim=-1).item()
-    #         last_request = (self.query_start_loc_cpu < token_slice.stop).int().argmax(dim=-1).item()
-    #         return slice(first_reqest, last_request)
-
-    # # Slice the current CommonAttentionMetatdata into two
-    # def _slice(self, token_slice: slice) -> CommonAttentionMetadata:
-    #     request_slice = self.compute_request_slice(token_slice)
-    #     query_start_loc = slice_query_start_locs(
-    #         self.query_start_loc, request_slice)
-        
-    #     seq_lens = self.seq_lens[request_slice]
-    #     num_requests = request_slice.stop - request_slice.start
-    #     num_actual_tokens = token_slice.stop - token_slice.start
-    #     #TODO(Sage) update this for prefill
-    #     max_query_len = 1
-
-    #     block_table = self.block_table
-    #     block_table_tensor = block_table.get_device_tensor()[req_slice]
-    #     block_table.slot_mapping[token_slice].copy_(
-    #         block_table.slot_mapping_cpu[token_slice],
-    #         non_blocking=True)
-    #     block_table.slot_mapping[token_slice.stop:].fill_(-1)
-    #     slot_mapping = block_table.slot_mapping[token_slice]
-
-    #     pass
-
-
 M = TypeVar("M")