mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-06 23:37:09 +08:00
cleanup
Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
parent
0e499c4f4d
commit
3d833aa759
@ -106,7 +106,6 @@ class CUDAPiecewiseBackend:
|
||||
end_monitoring_torch_compile(self.vllm_config)
|
||||
|
||||
def __call__(self, *args) -> Any:
|
||||
# logger.info("CUDA BACKEND CALL")
|
||||
if not self.first_run_finished:
|
||||
self.first_run_finished = True
|
||||
self.check_for_ending_compilation()
|
||||
|
||||
@ -170,7 +170,6 @@ def _support_torch_compile(
|
||||
# e.g. TPU has the compilation logic in model runner, so we don't
|
||||
# need to compile the model inside.
|
||||
if self.do_not_compile or torch.compiler.is_compiling():
|
||||
# logger.info("SKIPPING COMPILATION")
|
||||
return self.forward(*args, **kwargs)
|
||||
|
||||
# the first compilation needs to have dynamic shapes marked
|
||||
|
||||
@ -91,7 +91,6 @@ class DPMetadata:
|
||||
|
||||
# If num_tokens_across_dp is None, it will be computed by all_reduce
|
||||
# Otherwise, num_tokens_across_dp[dp_rank] should be equal to batchsize
|
||||
# print(f"num_tokens_across_dp {num_tokens_across_dp} batchsize {batchsize}")
|
||||
assert (num_tokens_across_dp is None
|
||||
or num_tokens_across_dp[dp_rank] == batchsize)
|
||||
if num_tokens_across_dp is None:
|
||||
|
||||
@ -903,7 +903,7 @@ def fused_topk(
|
||||
|
||||
|
||||
# This is used by the Deepseek-V2 and Deepseek-V3 model
|
||||
# @torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
|
||||
@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
|
||||
def grouped_topk(
|
||||
hidden_states: torch.Tensor,
|
||||
gating_output: torch.Tensor,
|
||||
|
||||
@ -19,7 +19,6 @@ import vllm.envs as envs
|
||||
from vllm.distributed.kv_transfer.kv_connector.utils import (
|
||||
get_kv_connector_cache_layout)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.v1.worker.block_table import BlockTable
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@ -32,8 +31,6 @@ class CommonAttentionMetadata:
|
||||
"""
|
||||
|
||||
query_start_loc: torch.Tensor
|
||||
|
||||
# query_start_loc_cpu: torch.Tensor
|
||||
"""(batch_size + 1,), the start location of each request in query Tensor"""
|
||||
seq_lens: torch.Tensor
|
||||
"""(batch_size,), the length of each request including both computed tokens
|
||||
@ -46,48 +43,6 @@ class CommonAttentionMetadata:
|
||||
max_query_len: int
|
||||
"""Longest query in batch"""
|
||||
|
||||
# block_table: BlockTable
|
||||
|
||||
# def compute_request_slice(self, token_slice: slice) -> slice:
|
||||
# """
|
||||
# return
|
||||
# - num_decodes: number of decode requests
|
||||
# - num_prefills: number of prefill requests
|
||||
# - num_decode_tokens: number of decode tokens
|
||||
# - num_prefill_tokens: number of prefill tokens
|
||||
# """
|
||||
# if self.max_query_len == 1:
|
||||
# # Pure decode
|
||||
# return token_slice
|
||||
# else:
|
||||
# # Find the first query_start_loc that's greater than the token_slice.start
|
||||
# first_request = (self.query_start_loc_cpu >= token_slice.start).int().argmax(dim=-1).item()
|
||||
# last_request = (self.query_start_loc_cpu < token_slice.stop).int().argmax(dim=-1).item()
|
||||
# return slice(first_request, last_request)
|
||||
|
||||
# # Slice the current CommonAttentionMetadata into two
|
||||
# def _slice(self, token_slice: slice) -> CommonAttentionMetadata:
|
||||
# request_slice = self.compute_request_slice(token_slice)
|
||||
# query_start_loc = slice_query_start_locs(
|
||||
# self.query_start_loc, request_slice)
|
||||
|
||||
# seq_lens = self.seq_lens[request_slice]
|
||||
# num_requests = request_slice.stop - request_slice.start
|
||||
# num_actual_tokens = token_slice.stop - token_slice.start
|
||||
# #TODO(Sage) update this for prefill
|
||||
# max_query_len = 1
|
||||
|
||||
# block_table = self.block_table
|
||||
# block_table_tensor = block_table.get_device_tensor()[req_slice]
|
||||
# block_table.slot_mapping[token_slice].copy_(
|
||||
# block_table.slot_mapping_cpu[token_slice],
|
||||
# non_blocking=True)
|
||||
# block_table.slot_mapping[token_slice.stop:].fill_(-1)
|
||||
# slot_mapping = block_table.slot_mapping[token_slice]
|
||||
|
||||
# pass
|
||||
|
||||
|
||||
M = TypeVar("M")
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user