Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
Sage Moore 2025-07-02 21:20:21 +00:00
parent 0e499c4f4d
commit 3d833aa759
5 changed files with 1 addition and 49 deletions

View File

@ -106,7 +106,6 @@ class CUDAPiecewiseBackend:
end_monitoring_torch_compile(self.vllm_config)
def __call__(self, *args) -> Any:
# logger.info("CUDA BACKEND CALL")
if not self.first_run_finished:
self.first_run_finished = True
self.check_for_ending_compilation()

View File

@ -170,7 +170,6 @@ def _support_torch_compile(
# e.g. TPU has the compilation logic in model runner, so we don't
# need to compile the model inside.
if self.do_not_compile or torch.compiler.is_compiling():
# logger.info("SKIPPING COMPILATION")
return self.forward(*args, **kwargs)
# the first compilation needs to have dynamic shapes marked

View File

@ -91,7 +91,6 @@ class DPMetadata:
# If num_tokens_across_dp is None, it will be computed by all_reduce
# Otherwise, num_tokens_across_dp[dp_rank] should be equal to batchsize
# print(f"num_tokens_across_dp {num_tokens_across_dp} batchsize {batchsize}")
assert (num_tokens_across_dp is None
or num_tokens_across_dp[dp_rank] == batchsize)
if num_tokens_across_dp is None:

View File

@ -903,7 +903,7 @@ def fused_topk(
# This is used by the Deepseek-V2 and Deepseek-V3 model
# @torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
def grouped_topk(
hidden_states: torch.Tensor,
gating_output: torch.Tensor,

View File

@ -19,7 +19,6 @@ import vllm.envs as envs
from vllm.distributed.kv_transfer.kv_connector.utils import (
get_kv_connector_cache_layout)
from vllm.logger import init_logger
from vllm.v1.worker.block_table import BlockTable
logger = init_logger(__name__)
@ -32,8 +31,6 @@ class CommonAttentionMetadata:
"""
query_start_loc: torch.Tensor
# query_start_loc_cpu: torch.Tensor
"""(batch_size + 1,), the start location of each request in query Tensor"""
seq_lens: torch.Tensor
"""(batch_size,), the length of each request including both computed tokens
@ -46,48 +43,6 @@ class CommonAttentionMetadata:
max_query_len: int
"""Longest query in batch"""
# block_table: BlockTable
# def compute_request_slice(self, token_slice: slice) -> slice:
# """
# return
# - num_decodes: number of decode requests
# - num_prefills: number of prefill requests
# - num_decode_tokens: number of decode tokens
# - num_prefill_tokens: number of prefill tokens
# """
# if self.max_query_len == 1:
# # Pure decode
# return token_slice
# else:
# # Find the first query_start_loc that's greater than the token_slice.start
# first_reqest = (self.query_start_loc_cpu >= token_slice.start).int().argmax(dim=-1).item()
# last_request = (self.query_start_loc_cpu < token_slice.stop).int().argmax(dim=-1).item()
# return slice(first_reqest, last_request)
# # Slice the current CommonAttentionMetatdata into two
# def _slice(self, token_slice: slice) -> CommonAttentionMetadata:
# request_slice = self.compute_request_slice(token_slice)
# query_start_loc = slice_query_start_locs(
# self.query_start_loc, request_slice)
# seq_lens = self.seq_lens[request_slice]
# num_requests = request_slice.stop - request_slice.start
# num_actual_tokens = token_slice.stop - token_slice.start
# #TODO(Sage) update this for prefill
# max_query_len = 1
# block_table = self.block_table
# block_table_tensor = block_table.get_device_tensor()[req_slice]
# block_table.slot_mapping[token_slice].copy_(
# block_table.slot_mapping_cpu[token_slice],
# non_blocking=True)
# block_table.slot_mapping[token_slice.stop:].fill_(-1)
# slot_mapping = block_table.slot_mapping[token_slice]
# pass
M = TypeVar("M")