mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-31 10:47:07 +08:00
reorder funcs
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
This commit is contained in:
parent
c715fb19e5
commit
0bddb6b9a5
@ -79,33 +79,6 @@ class ModelRunnerBase:
|
|||||||
assert self.model is not None
|
assert self.model is not None
|
||||||
return self.model
|
return self.model
|
||||||
|
|
||||||
def execute_model(
|
|
||||||
self,
|
|
||||||
scheduler_output: "SchedulerOutput",
|
|
||||||
) -> ModelRunnerOutput:
|
|
||||||
raise NotImplementedError()
|
|
||||||
|
|
||||||
def load_model(self) -> None:
|
|
||||||
raise NotImplementedError()
|
|
||||||
|
|
||||||
def dummy_run(
|
|
||||||
self,
|
|
||||||
kv_caches,
|
|
||||||
num_tokens: int,
|
|
||||||
seq_len: Optional[int] = None,
|
|
||||||
exec_mode: Optional[ExecutionMode] = None,
|
|
||||||
) -> torch.Tensor:
|
|
||||||
raise NotImplementedError()
|
|
||||||
|
|
||||||
def profile_run(self) -> None:
|
|
||||||
raise NotImplementedError()
|
|
||||||
|
|
||||||
def capture_model(self) -> None:
|
|
||||||
raise NotImplementedError()
|
|
||||||
|
|
||||||
def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
|
|
||||||
raise NotImplementedError()
|
|
||||||
|
|
||||||
def get_kv_cache_spec(self) -> KVCacheSpec:
|
def get_kv_cache_spec(self) -> KVCacheSpec:
|
||||||
"""
|
"""
|
||||||
Generates the KVCacheSpec by parsing the kv cache format from each
|
Generates the KVCacheSpec by parsing the kv cache format from each
|
||||||
@ -140,3 +113,30 @@ class ModelRunnerBase:
|
|||||||
f"Unknown attention type: {attn_module.attn_type}")
|
f"Unknown attention type: {attn_module.attn_type}")
|
||||||
|
|
||||||
return kv_cache_spec
|
return kv_cache_spec
|
||||||
|
|
||||||
|
def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def execute_model(
|
||||||
|
self,
|
||||||
|
scheduler_output: "SchedulerOutput",
|
||||||
|
) -> ModelRunnerOutput:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def load_model(self) -> None:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def dummy_run(
|
||||||
|
self,
|
||||||
|
kv_caches,
|
||||||
|
num_tokens: int,
|
||||||
|
seq_len: Optional[int] = None,
|
||||||
|
exec_mode: Optional[ExecutionMode] = None,
|
||||||
|
) -> torch.Tensor:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def profile_run(self) -> None:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def capture_model(self) -> None:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|||||||
@ -80,16 +80,10 @@ class WorkerBase:
|
|||||||
# Initialized by the specific platform
|
# Initialized by the specific platform
|
||||||
self.model_runner: Optional[ModelRunnerBase] = None
|
self.model_runner: Optional[ModelRunnerBase] = None
|
||||||
|
|
||||||
def init_device(self):
|
|
||||||
raise NotImplementedError()
|
|
||||||
|
|
||||||
def load_model(self) -> None:
|
def load_model(self) -> None:
|
||||||
assert self.model_runner is not None
|
assert self.model_runner is not None
|
||||||
self.model_runner.load_model()
|
self.model_runner.load_model()
|
||||||
|
|
||||||
def determine_available_memory(self) -> int:
|
|
||||||
raise NotImplementedError()
|
|
||||||
|
|
||||||
def compile_or_warm_up_model(self) -> None:
|
def compile_or_warm_up_model(self) -> None:
|
||||||
assert self.model_runner is not None
|
assert self.model_runner is not None
|
||||||
|
|
||||||
@ -113,12 +107,6 @@ class WorkerBase:
|
|||||||
assert self.model_runner is not None
|
assert self.model_runner is not None
|
||||||
self.model_runner.initialize_kv_cache(kv_cache_config)
|
self.model_runner.initialize_kv_cache(kv_cache_config)
|
||||||
|
|
||||||
def execute_model(
|
|
||||||
self,
|
|
||||||
scheduler_output: "SchedulerOutput",
|
|
||||||
) -> Optional[ModelRunnerOutput]:
|
|
||||||
raise NotImplementedError()
|
|
||||||
|
|
||||||
def profile(self, is_start: bool = True):
|
def profile(self, is_start: bool = True):
|
||||||
if self.profiler is None:
|
if self.profiler is None:
|
||||||
raise RuntimeError("Profiler is not enabled.")
|
raise RuntimeError("Profiler is not enabled.")
|
||||||
@ -131,6 +119,18 @@ class WorkerBase:
|
|||||||
# worker will always be healthy as long as it's running.
|
# worker will always be healthy as long as it's running.
|
||||||
return
|
return
|
||||||
|
|
||||||
|
def init_device(self):
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def determine_available_memory(self) -> int:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def execute_model(
|
||||||
|
self,
|
||||||
|
scheduler_output: "SchedulerOutput",
|
||||||
|
) -> Optional[ModelRunnerOutput]:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
|
||||||
def check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
|
def check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
|
||||||
# Check if the GPU supports the dtype.
|
# Check if the GPU supports the dtype.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user