diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py
index aabc9ed9b80a2..f45a94f3151b6 100644
--- a/vllm/executor/uniproc_executor.py
+++ b/vllm/executor/uniproc_executor.py
@@ -71,6 +71,10 @@ class UniProcExecutor(ExecutorBase):
             self.shutdown()
         return
 
+    def shutdown(self) -> None:
+        if worker := self.driver_worker:
+            worker.shutdown()
+
 
 UniProcExecutorAsync = UniProcExecutor
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index b75756fbdae85..ce53154896bae 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2070,7 +2070,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             sampler_output = self._sample(logits, spec_decode_metadata)
 
         with record_function_or_nullcontext("Bookkeep"):
-            assert isinstance(hidden_states, torch.Tensor)
             (
                 num_nans_in_logits,
                 logprobs_lists,
diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py
index 67bb967d2edfa..3eb9f26e9f5b6 100644
--- a/vllm/v1/worker/kv_connector_model_runner_mixin.py
+++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py
@@ -45,7 +45,8 @@ class KVConnectorModelRunnerMixin:
 
     @staticmethod
     def ensure_kv_transfer_shutdown() -> None:
-        if has_kv_transfer_group():
+        # has_kv_transfer_group can be None during interpreter shutdown.
+        if has_kv_transfer_group and has_kv_transfer_group():
             ensure_kv_transfer_shutdown()
 
     @staticmethod