cleanup

Signed-off-by: Sage Moore <sage@neuralmagic.com>
2026-07-29 13:54:29 +08:00 · 2025-07-02 21:20:21 +00:00 · 2025-07-02 21:20:21 +00:00 · 3d833aa759
commit 3d833aa759
parent 0e499c4f4d
5 changed files with 1 additions and 49 deletions
--- a/vllm/compilation/cuda_piecewise_backend.py
+++ b/vllm/compilation/cuda_piecewise_backend.py
@ -106,7 +106,6 @@ class CUDAPiecewiseBackend:
            end_monitoring_torch_compile(self.vllm_config)
    def __call__(self, *args) -> Any:
        # logger.info("CUDA BACKEND CALL")
        if not self.first_run_finished:
            self.first_run_finished = True
            self.check_for_ending_compilation()
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@ -170,7 +170,6 @@ def _support_torch_compile(
        # e.g. TPU has the compilation logic in model runner, so we don't
        # need to compile the model inside.
        if self.do_not_compile or torch.compiler.is_compiling():
            # logger.info("SKIPPING COMPILATION")
            return self.forward(*args, **kwargs)
        # the first compilation needs to have dynamic shapes marked
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@ -91,7 +91,6 @@ class DPMetadata:
        # If num_tokens_across_dp is None, it will be computed by all_reduce
        # Otherwise, num_tokens_across_dp[dp_rank] should be equal to batchsize
        # print(f"num_tokens_across_dp {num_tokens_across_dp} batchsize {batchsize}")
        assert (num_tokens_across_dp is None
                or num_tokens_across_dp[dp_rank] == batchsize)
        if num_tokens_across_dp is None:
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@ -903,7 +903,7 @@ def fused_topk(
 # This is used by the Deepseek-V2 and Deepseek-V3 model
-# @torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
+@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
 def grouped_topk(
    hidden_states: torch.Tensor,
    gating_output: torch.Tensor,
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@ -19,7 +19,6 @@ import vllm.envs as envs
 from vllm.distributed.kv_transfer.kv_connector.utils import (
    get_kv_connector_cache_layout)
 from vllm.logger import init_logger
 from vllm.v1.worker.block_table import BlockTable
 logger = init_logger(__name__)
@ -32,8 +31,6 @@ class CommonAttentionMetadata:
    """
    query_start_loc: torch.Tensor
    # query_start_loc_cpu: torch.Tensor
    """(batch_size + 1,), the start location of each request in query Tensor"""
    seq_lens: torch.Tensor
    """(batch_size,), the length of each request including both computed tokens
@ -46,48 +43,6 @@ class CommonAttentionMetadata:
    max_query_len: int
    """Longest query in batch"""
    # block_table: BlockTable
    # def compute_request_slice(self, token_slice: slice) -> slice:
    #     """
    #     return 
    #     - num_decodes: number of decode requests
    #     - num_prefills: number of prefill requests
    #     - num_decode_tokens: number of decode tokens
    #     - num_prefill_tokens: number of prefill tokens
    #     """
    #     if self.max_query_len == 1:
    #         # Pure decode
    #         return token_slice
    #     else:
    #         # Find the first query_start_loc that's greater than the token_slice.start
    #         first_reqest = (self.query_start_loc_cpu >= token_slice.start).int().argmax(dim=-1).item()
    #         last_request = (self.query_start_loc_cpu < token_slice.stop).int().argmax(dim=-1).item()
    #         return slice(first_reqest, last_request)
    # # Slice the current CommonAttentionMetatdata into two
    # def _slice(self, token_slice: slice) -> CommonAttentionMetadata:
    #     request_slice = self.compute_request_slice(token_slice)
    #     query_start_loc = slice_query_start_locs(
    #         self.query_start_loc, request_slice)
    #     seq_lens = self.seq_lens[request_slice]
    #     num_requests = request_slice.stop - request_slice.start
    #     num_actual_tokens = token_slice.stop - token_slice.start
    #     #TODO(Sage) update this for prefill
    #     max_query_len = 1
    #     block_table = self.block_table
    #     block_table_tensor = block_table.get_device_tensor()[req_slice]
    #     block_table.slot_mapping[token_slice].copy_(
    #         block_table.slot_mapping_cpu[token_slice],
    #         non_blocking=True)
    #     block_table.slot_mapping[token_slice.stop:].fill_(-1)
    #     slot_mapping = block_table.slot_mapping[token_slice]
    #     pass
 M = TypeVar("M")