diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index e351f0e925250..73c0da45d4ab3 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -226,7 +226,7 @@ def test_update_states_request_resumed(model_runner): req_id=req_id, resumed_from_preemption=False, new_token_ids=[], - new_block_ids=[], + new_block_ids=[[]], num_computed_tokens=0, ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a90c294a97493..e3535ef143ada 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -460,8 +460,11 @@ class GPUModelRunner(LoRAModelRunnerMixin): # Update the block IDs. if not req_data.resumed_from_preemption: # Append the new blocks to the existing block IDs. - for i in range(len(self.kv_cache_config.kv_cache_groups)): - req_state.block_ids[i].extend(req_data.new_block_ids[i]) + for block_ids, new_block_ids in zip( # type: ignore[call-overload] + req_state.block_ids, + req_data.new_block_ids, + strict=True): + block_ids.extend(new_block_ids) else: # The request is resumed from preemption. # Replace the existing block IDs with the new ones. diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 843bc36953b57..d5f40e4d3103c 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -413,7 +413,11 @@ class TPUModelRunner(LoRAModelRunnerMixin): req_state.num_computed_tokens = req_data.num_computed_tokens if not req_data.resumed_from_preemption: # Append the new blocks to the existing block IDs. - req_state.block_ids.extend(req_data.new_block_ids) + for block_ids, new_block_ids in zip( # type: ignore[call-overload] + req_state.block_ids, + req_data.new_block_ids, + strict=True): + block_ids.extend(new_block_ids) else: # The request is resumed from preemption. # Replace the existing block IDs with the new ones.