diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 4388b3c1ee164..1bc5f65c7127f 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1782,6 +1782,9 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
             kv_caches: vLLM's paged memory
         """
 
+        if self.vllm_config.kv_transfer_config is None:
+            return False
+
         prefill_meta = model_input.attn_metadata.prefill_metadata
 
         # check if the current run is profiling
@@ -1789,9 +1792,6 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
         # check if the current run is prefill
         is_prefill_run = prefill_meta is not None
 
-        if self.vllm_config.kv_transfer_config is None:
-            return False
-
         return self.vllm_config.kv_transfer_config.is_kv_consumer and (
             not is_profile_run) and is_prefill_run
 
@@ -1807,6 +1807,9 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
             kv_caches: vLLM's paged memory
         """
 
+        if self.vllm_config.kv_transfer_config is None:
+            return False
+
         prefill_meta = model_input.attn_metadata.prefill_metadata
 
         # check if the current run is profiling
@@ -1814,9 +1817,6 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
         # check if the current run is prefill
         is_prefill_run = prefill_meta is not None
 
-        if self.vllm_config.kv_transfer_config is None:
-            return False
-
         return self.vllm_config.kv_transfer_config.is_kv_producer and (
             not is_profile_run) and is_prefill_run
 
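
The change hoists the `kv_transfer_config is None` early return to the top of both checks, so when KV transfer is not configured the functions bail out before ever dereferencing `model_input.attn_metadata`. Below is a minimal, self-contained sketch of that guard-ordering pattern; it uses hypothetical stand-in types (`KVTransferConfig`, `AttnMetadata`, a free-standing `need_recv_kv`) rather than vLLM's actual ModelRunner API, purely to illustrate the ordering.

# Sketch only: stand-in types, not vLLM's real classes.
from dataclasses import dataclass
from typing import Optional


@dataclass
class KVTransferConfig:
    is_kv_consumer: bool = False
    is_kv_producer: bool = False


@dataclass
class AttnMetadata:
    prefill_metadata: Optional[object] = None


def need_recv_kv(kv_transfer_config: Optional[KVTransferConfig],
                 attn_metadata: Optional[AttnMetadata],
                 is_profile_run: bool) -> bool:
    # Guard first: if KV transfer is not configured, return before
    # touching attn_metadata at all (the old ordering read
    # attn_metadata.prefill_metadata even in that case).
    if kv_transfer_config is None:
        return False

    # Only reached when KV transfer is actually configured.
    is_prefill_run = (attn_metadata is not None
                      and attn_metadata.prefill_metadata is not None)
    return (kv_transfer_config.is_kv_consumer and not is_profile_run
            and is_prefill_run)


# With no kv_transfer_config the call returns False without inspecting
# attn_metadata, which may itself be unset in that configuration.
assert need_recv_kv(None, None, is_profile_run=False) is False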