From 3d833aa75956ef5e8969a9361c97a06c8eb8653f Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Wed, 2 Jul 2025 21:20:21 +0000 Subject: [PATCH] cleanup Signed-off-by: Sage Moore --- vllm/compilation/cuda_piecewise_backend.py | 1 - vllm/compilation/decorators.py | 1 - vllm/forward_context.py | 1 - .../layers/fused_moe/fused_moe.py | 2 +- vllm/v1/attention/backends/utils.py | 45 ------------------- 5 files changed, 1 insertion(+), 49 deletions(-) diff --git a/vllm/compilation/cuda_piecewise_backend.py b/vllm/compilation/cuda_piecewise_backend.py index 18c3dfe0f171e..8c49ea6cc1074 100644 --- a/vllm/compilation/cuda_piecewise_backend.py +++ b/vllm/compilation/cuda_piecewise_backend.py @@ -106,7 +106,6 @@ class CUDAPiecewiseBackend: end_monitoring_torch_compile(self.vllm_config) def __call__(self, *args) -> Any: - # logger.info("CUDA BACKEND CALL") if not self.first_run_finished: self.first_run_finished = True self.check_for_ending_compilation() diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 5f1b268a1d6fe..05e4ca9f08b36 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -170,7 +170,6 @@ def _support_torch_compile( # e.g. TPU has the compilation logic in model runner, so we don't # need to compile the model inside. if self.do_not_compile or torch.compiler.is_compiling(): - # logger.info("SKIPPING COMPILATION") return self.forward(*args, **kwargs) # the first compilation needs to have dynamic shapes marked diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 4671db4971132..ed86247dd9d28 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -91,7 +91,6 @@ class DPMetadata: # If num_tokens_across_dp is None, it will be computed by all_reduce # Otherwise, num_tokens_across_dp[dp_rank] should be equal to batchsize - # print(f"num_tokens_across_dp {num_tokens_across_dp} batchsize {batchsize}") assert (num_tokens_across_dp is None or num_tokens_across_dp[dp_rank] == batchsize) if num_tokens_across_dp is None: diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 3e9ef23b6ef26..f22884b8a1a5a 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -903,7 +903,7 @@ def fused_topk( # This is used by the Deepseek-V2 and Deepseek-V3 model -# @torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) def grouped_topk( hidden_states: torch.Tensor, gating_output: torch.Tensor, diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index f12bdfc5f35d6..0bd7eaf7f8154 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -19,7 +19,6 @@ import vllm.envs as envs from vllm.distributed.kv_transfer.kv_connector.utils import ( get_kv_connector_cache_layout) from vllm.logger import init_logger -from vllm.v1.worker.block_table import BlockTable logger = init_logger(__name__) @@ -32,8 +31,6 @@ class CommonAttentionMetadata: """ query_start_loc: torch.Tensor - - # query_start_loc_cpu: torch.Tensor """(batch_size + 1,), the start location of each request in query Tensor""" seq_lens: torch.Tensor """(batch_size,), the length of each request including both computed tokens @@ -46,48 +43,6 @@ class CommonAttentionMetadata: max_query_len: int """Longest query in batch""" - # block_table: BlockTable - - # def compute_request_slice(self, token_slice: slice) -> slice: - # """ - # return - # - num_decodes: number of decode requests - # - num_prefills: number of prefill requests - # - num_decode_tokens: number of decode tokens - # - num_prefill_tokens: number of prefill tokens - # """ - # if self.max_query_len == 1: - # # Pure decode - # return token_slice - # else: - # # Find the first query_start_loc that's greater than the token_slice.start - # first_reqest = (self.query_start_loc_cpu >= token_slice.start).int().argmax(dim=-1).item() - # last_request = (self.query_start_loc_cpu < token_slice.stop).int().argmax(dim=-1).item() - # return slice(first_reqest, last_request) - - # # Slice the current CommonAttentionMetatdata into two - # def _slice(self, token_slice: slice) -> CommonAttentionMetadata: - # request_slice = self.compute_request_slice(token_slice) - # query_start_loc = slice_query_start_locs( - # self.query_start_loc, request_slice) - - # seq_lens = self.seq_lens[request_slice] - # num_requests = request_slice.stop - request_slice.start - # num_actual_tokens = token_slice.stop - token_slice.start - # #TODO(Sage) update this for prefill - # max_query_len = 1 - - # block_table = self.block_table - # block_table_tensor = block_table.get_device_tensor()[req_slice] - # block_table.slot_mapping[token_slice].copy_( - # block_table.slot_mapping_cpu[token_slice], - # non_blocking=True) - # block_table.slot_mapping[token_slice.stop:].fill_(-1) - # slot_mapping = block_table.slot_mapping[token_slice] - - # pass - - M = TypeVar("M")