From a4962833f976973848e0267899e0ef1edc1ed30c Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 18 Sep 2025 13:20:37 -0700 Subject: [PATCH] minor Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/model_runner.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index e100f299cd264..22015d2680b30 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -51,9 +51,8 @@ class GPUModelRunner: self.device = device self.pin_memory = is_pin_memory_available() self.dtype = self.model_config.dtype - if self.cache_config.cache_dtype == "auto": - self.kv_cache_dtype = self.dtype - else: + self.kv_cache_dtype = self.dtype + if self.cache_config.cache_dtype != "auto": # Quantized KV cache. self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ self.cache_config.cache_dtype] @@ -99,9 +98,6 @@ class GPUModelRunner: def profile_run(self): pass - def maybe_remove_all_loras(self, lora_config): - pass - def get_kv_cache_spec(self): return get_kv_cache_spec(self.vllm_config, self.kv_cache_dtype) @@ -269,6 +265,7 @@ class GPUModelRunner: slot_mappings = self.block_tables.compute_slot_mappings( query_start_loc_gpu, positions.gpu[:num_tokens]) logits_indices = query_start_loc_gpu[1:] - 1 + num_logits_indices = logits_indices.size(0) # Layer name -> attention metadata. attn_metadata: dict[str, Any] = {} @@ -290,7 +287,7 @@ class GPUModelRunner: block_table_tensor=block_table, slot_mapping=slot_mapping, logits_indices_padded=None, - num_logits_indices=logits_indices.size(0), + num_logits_indices=num_logits_indices, causal=True, encoder_seq_lens=None, )