diff --git a/vllm/v1/worker/model_runner_base.py b/vllm/v1/worker/model_runner_base.py
index e46242cb13841..f45b24def7843 100644
--- a/vllm/v1/worker/model_runner_base.py
+++ b/vllm/v1/worker/model_runner_base.py
@@ -79,33 +79,6 @@ class ModelRunnerBase:
         assert self.model is not None
         return self.model
 
-    def execute_model(
-        self,
-        scheduler_output: "SchedulerOutput",
-    ) -> ModelRunnerOutput:
-        raise NotImplementedError()
-
-    def load_model(self) -> None:
-        raise NotImplementedError()
-
-    def dummy_run(
-        self,
-        kv_caches,
-        num_tokens: int,
-        seq_len: Optional[int] = None,
-        exec_mode: Optional[ExecutionMode] = None,
-    ) -> torch.Tensor:
-        raise NotImplementedError()
-
-    def profile_run(self) -> None:
-        raise NotImplementedError()
-
-    def capture_model(self) -> None:
-        raise NotImplementedError()
-
     def get_kv_cache_spec(self) -> KVCacheSpec:
         """
         Generates the KVCacheSpec by parsing the kv cache format from each
@@ -140,3 +113,30 @@ class ModelRunnerBase:
                     f"Unknown attention type: {attn_module.attn_type}")
 
         return kv_cache_spec
+
+    def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
+        raise NotImplementedError()
+
+    def execute_model(
+        self,
+        scheduler_output: "SchedulerOutput",
+    ) -> ModelRunnerOutput:
+        raise NotImplementedError()
+
+    def load_model(self) -> None:
+        raise NotImplementedError()
+
+    def dummy_run(
+        self,
+        kv_caches,
+        num_tokens: int,
+        seq_len: Optional[int] = None,
+        exec_mode: Optional[ExecutionMode] = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError()
+
+    def profile_run(self) -> None:
+        raise NotImplementedError()
+
+    def capture_model(self) -> None:
+        raise NotImplementedError()
diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py
index d72f67b3a4d77..82260b9449f31 100644
--- a/vllm/v1/worker/worker_base.py
+++ b/vllm/v1/worker/worker_base.py
@@ -80,16 +80,10 @@ class WorkerBase:
 
         # Initialized by the specific platform
         self.model_runner: Optional[ModelRunnerBase] = None
 
-    def init_device(self):
-        raise NotImplementedError()
-
     def load_model(self) -> None:
         assert self.model_runner is not None
         self.model_runner.load_model()
 
-    def determine_available_memory(self) -> int:
-        raise NotImplementedError()
-
     def compile_or_warm_up_model(self) -> None:
         assert self.model_runner is not None
@@ -113,12 +107,6 @@ class WorkerBase:
         assert self.model_runner is not None
         self.model_runner.initialize_kv_cache(kv_cache_config)
 
-    def execute_model(
-        self,
-        scheduler_output: "SchedulerOutput",
-    ) -> Optional[ModelRunnerOutput]:
-        raise NotImplementedError()
-
     def profile(self, is_start: bool = True):
         if self.profiler is None:
             raise RuntimeError("Profiler is not enabled.")
@@ -131,6 +119,18 @@ class WorkerBase:
         # worker will always be healthy as long as it's running.
         return
 
+    def init_device(self):
+        raise NotImplementedError()
+
+    def determine_available_memory(self) -> int:
+        raise NotImplementedError()
+
+    def execute_model(
+        self,
+        scheduler_output: "SchedulerOutput",
+    ) -> Optional[ModelRunnerOutput]:
+        raise NotImplementedError()
+
 
 def check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
     # Check if the GPU supports the dtype.