mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-31 06:37:04 +08:00
minor
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
parent
67d8c0c21b
commit
a98eff0762
@ -1,5 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.attention.backends.abstract import AttentionBackend, AttentionType
|
from vllm.attention.backends.abstract import AttentionBackend, AttentionType
|
||||||
@ -8,6 +10,7 @@ from vllm.config import VllmConfig, get_layers_from_vllm_config
|
|||||||
from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
|
from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
|
||||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||||
KVCacheSpec, SlidingWindowSpec)
|
KVCacheSpec, SlidingWindowSpec)
|
||||||
|
from vllm.v1.worker.utils import bind_kv_cache
|
||||||
|
|
||||||
|
|
||||||
def get_kv_cache_spec(
|
def get_kv_cache_spec(
|
||||||
@ -124,6 +127,8 @@ def _reshape_kv_cache(
|
|||||||
|
|
||||||
|
|
||||||
def init_kv_cache(
|
def init_kv_cache(
|
||||||
|
runner_kv_caches: list[torch.Tensor],
|
||||||
|
forward_context: dict[str, Any],
|
||||||
kv_cache_config: KVCacheConfig,
|
kv_cache_config: KVCacheConfig,
|
||||||
attn_backends: dict[str, AttentionBackend],
|
attn_backends: dict[str, AttentionBackend],
|
||||||
device: torch.device,
|
device: torch.device,
|
||||||
@ -131,4 +136,4 @@ def init_kv_cache(
|
|||||||
kv_cache_raw_tensors = _allocate_kv_cache(kv_cache_config, device)
|
kv_cache_raw_tensors = _allocate_kv_cache(kv_cache_config, device)
|
||||||
kv_caches = _reshape_kv_cache(kv_cache_config, kv_cache_raw_tensors,
|
kv_caches = _reshape_kv_cache(kv_cache_config, kv_cache_raw_tensors,
|
||||||
attn_backends)
|
attn_backends)
|
||||||
return kv_caches
|
bind_kv_cache(forward_context, kv_caches, runner_kv_caches)
|
||||||
|
|||||||
@ -25,7 +25,6 @@ from vllm.v1.worker.gpu.input_batch import (InputBatch, InputBuffers,
|
|||||||
prepare_inputs)
|
prepare_inputs)
|
||||||
from vllm.v1.worker.gpu.sampler import Sampler
|
from vllm.v1.worker.gpu.sampler import Sampler
|
||||||
from vllm.v1.worker.gpu.states import RequestState
|
from vllm.v1.worker.gpu.states import RequestState
|
||||||
from vllm.v1.worker.utils import bind_kv_cache
|
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
@ -98,9 +97,6 @@ class GPUModelRunner:
|
|||||||
m.consumed_memory / GiB_bytes,
|
m.consumed_memory / GiB_bytes,
|
||||||
time_after_load - time_before_load)
|
time_after_load - time_before_load)
|
||||||
|
|
||||||
def profile_run(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def get_kv_cache_spec(self):
|
def get_kv_cache_spec(self):
|
||||||
return get_kv_cache_spec(self.vllm_config, self.kv_cache_dtype)
|
return get_kv_cache_spec(self.vllm_config, self.kv_cache_dtype)
|
||||||
|
|
||||||
@ -124,17 +120,14 @@ class GPUModelRunner:
|
|||||||
self.device,
|
self.device,
|
||||||
)
|
)
|
||||||
|
|
||||||
kv_caches = init_kv_cache(
|
self.kv_caches: list[torch.Tensor] = []
|
||||||
|
init_kv_cache(
|
||||||
|
self.kv_caches,
|
||||||
|
self.compilation_config.static_forward_context,
|
||||||
self.kv_cache_config,
|
self.kv_cache_config,
|
||||||
self.attn_backends,
|
self.attn_backends,
|
||||||
self.device,
|
self.device,
|
||||||
)
|
)
|
||||||
self.kv_caches: list[torch.Tensor] = []
|
|
||||||
bind_kv_cache(
|
|
||||||
kv_caches,
|
|
||||||
self.compilation_config.static_forward_context,
|
|
||||||
self.kv_caches,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _dummy_run(self, num_tokens: int, *args, **kwargs) -> None:
|
def _dummy_run(self, num_tokens: int, *args, **kwargs) -> None:
|
||||||
return None, None
|
return None, None
|
||||||
@ -143,6 +136,9 @@ class GPUModelRunner:
|
|||||||
**kwargs) -> None:
|
**kwargs) -> None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def profile_run(self):
|
||||||
|
pass
|
||||||
|
|
||||||
def update_states(self, scheduler_output: SchedulerOutput) -> None:
|
def update_states(self, scheduler_output: SchedulerOutput) -> None:
|
||||||
# for req_id in scheduler_output.preempted_req_ids:
|
# for req_id in scheduler_output.preempted_req_ids:
|
||||||
# self.req_states.remove_request(req_id)
|
# self.req_states.remove_request(req_id)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user