mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-22 14:17:54 +08:00
cleanup
Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
parent
0e499c4f4d
commit
3d833aa759
@ -106,7 +106,6 @@ class CUDAPiecewiseBackend:
|
|||||||
end_monitoring_torch_compile(self.vllm_config)
|
end_monitoring_torch_compile(self.vllm_config)
|
||||||
|
|
||||||
def __call__(self, *args) -> Any:
|
def __call__(self, *args) -> Any:
|
||||||
# logger.info("CUDA BACKEND CALL")
|
|
||||||
if not self.first_run_finished:
|
if not self.first_run_finished:
|
||||||
self.first_run_finished = True
|
self.first_run_finished = True
|
||||||
self.check_for_ending_compilation()
|
self.check_for_ending_compilation()
|
||||||
|
|||||||
@ -170,7 +170,6 @@ def _support_torch_compile(
|
|||||||
# e.g. TPU has the compilation logic in model runner, so we don't
|
# e.g. TPU has the compilation logic in model runner, so we don't
|
||||||
# need to compile the model inside.
|
# need to compile the model inside.
|
||||||
if self.do_not_compile or torch.compiler.is_compiling():
|
if self.do_not_compile or torch.compiler.is_compiling():
|
||||||
# logger.info("SKIPPING COMPILATION")
|
|
||||||
return self.forward(*args, **kwargs)
|
return self.forward(*args, **kwargs)
|
||||||
|
|
||||||
# the first compilation needs to have dynamic shapes marked
|
# the first compilation needs to have dynamic shapes marked
|
||||||
|
|||||||
@ -91,7 +91,6 @@ class DPMetadata:
|
|||||||
|
|
||||||
# If num_tokens_across_dp is None, it will be computed by all_reduce
|
# If num_tokens_across_dp is None, it will be computed by all_reduce
|
||||||
# Otherwise, num_tokens_across_dp[dp_rank] should be equal to batchsize
|
# Otherwise, num_tokens_across_dp[dp_rank] should be equal to batchsize
|
||||||
# print(f"num_tokens_across_dp {num_tokens_across_dp} batchsize {batchsize}")
|
|
||||||
assert (num_tokens_across_dp is None
|
assert (num_tokens_across_dp is None
|
||||||
or num_tokens_across_dp[dp_rank] == batchsize)
|
or num_tokens_across_dp[dp_rank] == batchsize)
|
||||||
if num_tokens_across_dp is None:
|
if num_tokens_across_dp is None:
|
||||||
|
|||||||
@ -903,7 +903,7 @@ def fused_topk(
|
|||||||
|
|
||||||
|
|
||||||
# This is used by the Deepseek-V2 and Deepseek-V3 model
|
# This is used by the Deepseek-V2 and Deepseek-V3 model
|
||||||
# @torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
|
@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
|
||||||
def grouped_topk(
|
def grouped_topk(
|
||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
gating_output: torch.Tensor,
|
gating_output: torch.Tensor,
|
||||||
|
|||||||
@ -19,7 +19,6 @@ import vllm.envs as envs
|
|||||||
from vllm.distributed.kv_transfer.kv_connector.utils import (
|
from vllm.distributed.kv_transfer.kv_connector.utils import (
|
||||||
get_kv_connector_cache_layout)
|
get_kv_connector_cache_layout)
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.v1.worker.block_table import BlockTable
|
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
@ -32,8 +31,6 @@ class CommonAttentionMetadata:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
query_start_loc: torch.Tensor
|
query_start_loc: torch.Tensor
|
||||||
|
|
||||||
# query_start_loc_cpu: torch.Tensor
|
|
||||||
"""(batch_size + 1,), the start location of each request in query Tensor"""
|
"""(batch_size + 1,), the start location of each request in query Tensor"""
|
||||||
seq_lens: torch.Tensor
|
seq_lens: torch.Tensor
|
||||||
"""(batch_size,), the length of each request including both computed tokens
|
"""(batch_size,), the length of each request including both computed tokens
|
||||||
@ -46,48 +43,6 @@ class CommonAttentionMetadata:
|
|||||||
max_query_len: int
|
max_query_len: int
|
||||||
"""Longest query in batch"""
|
"""Longest query in batch"""
|
||||||
|
|
||||||
# block_table: BlockTable
|
|
||||||
|
|
||||||
# def compute_request_slice(self, token_slice: slice) -> slice:
|
|
||||||
# """
|
|
||||||
# return
|
|
||||||
# - num_decodes: number of decode requests
|
|
||||||
# - num_prefills: number of prefill requests
|
|
||||||
# - num_decode_tokens: number of decode tokens
|
|
||||||
# - num_prefill_tokens: number of prefill tokens
|
|
||||||
# """
|
|
||||||
# if self.max_query_len == 1:
|
|
||||||
# # Pure decode
|
|
||||||
# return token_slice
|
|
||||||
# else:
|
|
||||||
# # Find the first query_start_loc that's greater than the token_slice.start
|
|
||||||
# first_reqest = (self.query_start_loc_cpu >= token_slice.start).int().argmax(dim=-1).item()
|
|
||||||
# last_request = (self.query_start_loc_cpu < token_slice.stop).int().argmax(dim=-1).item()
|
|
||||||
# return slice(first_reqest, last_request)
|
|
||||||
|
|
||||||
# # Slice the current CommonAttentionMetatdata into two
|
|
||||||
# def _slice(self, token_slice: slice) -> CommonAttentionMetadata:
|
|
||||||
# request_slice = self.compute_request_slice(token_slice)
|
|
||||||
# query_start_loc = slice_query_start_locs(
|
|
||||||
# self.query_start_loc, request_slice)
|
|
||||||
|
|
||||||
# seq_lens = self.seq_lens[request_slice]
|
|
||||||
# num_requests = request_slice.stop - request_slice.start
|
|
||||||
# num_actual_tokens = token_slice.stop - token_slice.start
|
|
||||||
# #TODO(Sage) update this for prefill
|
|
||||||
# max_query_len = 1
|
|
||||||
|
|
||||||
# block_table = self.block_table
|
|
||||||
# block_table_tensor = block_table.get_device_tensor()[req_slice]
|
|
||||||
# block_table.slot_mapping[token_slice].copy_(
|
|
||||||
# block_table.slot_mapping_cpu[token_slice],
|
|
||||||
# non_blocking=True)
|
|
||||||
# block_table.slot_mapping[token_slice.stop:].fill_(-1)
|
|
||||||
# slot_mapping = block_table.slot_mapping[token_slice]
|
|
||||||
|
|
||||||
# pass
|
|
||||||
|
|
||||||
|
|
||||||
M = TypeVar("M")
|
M = TypeVar("M")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user