From 22bf5c50779bee221c76ff2f1d168ebd90f1df36 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sat, 11 Oct 2025 11:38:33 -0700
Subject: [PATCH] Remove req_id_to_index from ModelRunnerOutput

Drop the req_id_to_index field from ModelRunnerOutput and delete every
place that built or passed it; a request's index is already implied by
its position in req_ids.

Signed-off-by: Woosuk Kwon
---
 tests/v1/core/test_async_scheduler.py         |  1 -
 tests/v1/core/test_scheduler.py               | 42 -------------------
 .../kv_connector/unit/test_nixl_connector.py  |  2 -
 tests/v1/kv_connector/unit/utils.py           |  2 -
 vllm/v1/outputs.py                            |  3 --
 vllm/v1/worker/gpu_model_runner.py            |  6 ---
 vllm/v1/worker/tpu_model_runner.py            |  1 -
 7 files changed, 57 deletions(-)

diff --git a/tests/v1/core/test_async_scheduler.py b/tests/v1/core/test_async_scheduler.py
index 6d870b5640dfb..ebce668febab8 100644
--- a/tests/v1/core/test_async_scheduler.py
+++ b/tests/v1/core/test_async_scheduler.py
@@ -20,7 +20,6 @@ def _make_model_runner_output(
     req_ids = list(scheduler_output.num_scheduled_tokens.keys())
     return ModelRunnerOutput(
         req_ids=req_ids,
-        req_id_to_index={req_id: i for i, req_id in enumerate(req_ids)},
         sampled_token_ids=[[i] for i in range(len(req_ids))],
         logprobs=None,
         prompt_logprobs_dict={},
diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index ff15af70b88bc..44b7ea3e3bfd6 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -163,10 +163,8 @@ def test_schedule_partial_requests():
     # The third request is also scheduled partially.
     # The tokens are not scheduled because of the encoder budget.
     assert output.num_scheduled_tokens[requests[2].request_id] == 100
-    req_to_index = {request.request_id: i for i, request in enumerate(requests)}
     model_runner_output = ModelRunnerOutput(
         req_ids=[request.request_id for request in requests],
-        req_id_to_index=req_to_index,
         # Only the first request has a sampled token id because
         # the rest requests are still being prefilled.
         sampled_token_ids=[[0], [], []],
@@ -212,10 +210,8 @@ def test_no_mm_input_chunking():
     # We want to only see the 400 text tokens at the start scheduled
     assert output.num_scheduled_tokens[requests[0].request_id] == 400
-    req_to_index = {request.request_id: i for i, request in enumerate(requests)}
     model_runner_output = ModelRunnerOutput(
         req_ids=[request.request_id for request in requests],
-        req_id_to_index=req_to_index,
         sampled_token_ids=[[] for _ in range(len(requests))],
         logprobs=None,
         prompt_logprobs_dict={},
@@ -272,10 +268,8 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
     assert output.num_scheduled_tokens[requests[1].request_id] == 400
     # The third request is also scheduled partially - 1024 - 400 - 400 = 224.
     assert output.num_scheduled_tokens[requests[2].request_id] == 224
-    req_to_index = {request.request_id: i for i, request in enumerate(requests)}
     model_runner_output = ModelRunnerOutput(
         req_ids=[request.request_id for request in requests],
-        req_id_to_index=req_to_index,
         sampled_token_ids=[[] for _ in range(len(requests))],
         logprobs=None,
         prompt_logprobs_dict={},
@@ -299,7 +293,6 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
     # All the remaining tokens in the third request are processed.
     model_runner_output = ModelRunnerOutput(
         req_ids=[request.request_id for request in requests],
-        req_id_to_index=req_to_index,
         sampled_token_ids=[[0], [0]] + [[] for _ in range(len(requests) - 2)],
         logprobs=None,
         prompt_logprobs_dict={},
@@ -347,7 +340,6 @@ def test_stop_via_update_from_output():
     model_output = ModelRunnerOutput(
         req_ids=[req.request_id for req in requests],
-        req_id_to_index={req.request_id: i for i, req in enumerate(requests)},
         sampled_token_ids=[
             [EOS_TOKEN_ID],
             [10, 11],
@@ -395,7 +387,6 @@ def test_stop_via_update_from_output():
     model_output = ModelRunnerOutput(
         req_ids=[req.request_id for req in requests],
-        req_id_to_index={req.request_id: i for i, req in enumerate(requests)},
         sampled_token_ids=[[10, 42, 12], [13, 14]],  # First request hits stop token
         logprobs=None,
         prompt_logprobs_dict={},
@@ -441,7 +432,6 @@ def test_stop_via_update_from_output():
     model_output = ModelRunnerOutput(
         req_ids=[req.request_id for req in requests],
-        req_id_to_index={req.request_id: i for i, req in enumerate(requests)},
         sampled_token_ids=[[10, 11, 12], [13]],  # First request exceeds max_tokens
         logprobs=None,
         prompt_logprobs_dict={},
@@ -482,7 +472,6 @@ def test_stop_via_update_from_output():
     model_output = ModelRunnerOutput(
         req_ids=[requests[0].request_id],
-        req_id_to_index={requests[0].request_id: 0},
         sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
         logprobs=None,
         prompt_logprobs_dict={},
@@ -623,7 +612,6 @@ def test_schedule_concurrent_batches(
     # Model output of the first request.
     model_runner_output = ModelRunnerOutput(
         req_ids=[requests[0].request_id],
-        req_id_to_index={requests[0].request_id: 0},
         sampled_token_ids=[[0]],
         logprobs=None,
         prompt_logprobs_dict={},
@@ -640,7 +628,6 @@ def test_schedule_concurrent_batches(
     # Model output of the second request.
     model_runner_output = ModelRunnerOutput(
         req_ids=[requests[1].request_id],
-        req_id_to_index={requests[1].request_id: 0},
         sampled_token_ids=[[0]],
         logprobs=None,
         prompt_logprobs_dict={},
@@ -677,7 +664,6 @@ def test_preempt_during_execution():
     # Get the output of the first request.
     model_runner_output0 = ModelRunnerOutput(
         req_ids=[requests[0].request_id],
-        req_id_to_index={requests[0].request_id: 0},
         sampled_token_ids=[[0]],
         logprobs=None,
         prompt_logprobs_dict={},
@@ -694,7 +680,6 @@ def test_preempt_during_execution():
     model_runner_output1 = ModelRunnerOutput(
         req_ids=[requests[1].request_id],
-        req_id_to_index={requests[1].request_id: 0},
         sampled_token_ids=[[42]],
         logprobs=None,
         prompt_logprobs_dict={},
@@ -735,11 +720,9 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
     scheduler = create_scheduler(num_speculative_tokens=num_spec_tokens)
     requests = create_requests(num_requests=len(spec_tokens), num_tokens=1)
     req_ids = []
-    req_to_index = {}
     for i, request in enumerate(requests):
         scheduler.add_request(request)
         req_ids.append(request.request_id)
-        req_to_index[request.request_id] = i

     # Schedule a decode, which will also draft speculative tokens
     output = scheduler.schedule()
@@ -752,7 +735,6 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
     model_runner_output = ModelRunnerOutput(
         req_ids=req_ids,
-        req_id_to_index=req_to_index,
         sampled_token_ids=[[0] for _ in range(len(requests))],
         logprobs=None,
         prompt_logprobs_dict={},
@@ -795,7 +777,6 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
     model_runner_output = ModelRunnerOutput(
         req_ids=req_ids,
-        req_id_to_index=req_to_index,
         sampled_token_ids=output_tokens,
         logprobs=None,
         prompt_logprobs_dict={},
@@ -927,15 +908,12 @@ def test_kv_connector_basic():
         block_size=BLOCK_SIZE,
     )
     req_ids = []
-    req_to_index = {}
     for i, request in enumerate(requests):
         scheduler.add_request(request)
         req_ids.append(request.request_id)
-        req_to_index[request.request_id] = i

     MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
         req_ids=req_ids,
-        req_id_to_index=req_to_index,
         sampled_token_ids=[[1000]] * len(req_ids),
         logprobs=None,
         prompt_logprobs_dict={},
@@ -977,15 +955,12 @@ def test_kv_connector_basic():
         block_size=BLOCK_SIZE,
     )
    req_ids = []
-    req_to_index = {}
     for i, request in enumerate(requests):
         scheduler.add_request(request)
         req_ids.append(request.request_id)
-        req_to_index[request.request_id] = i

     MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
         req_ids=req_ids,
-        req_id_to_index=req_to_index,
         sampled_token_ids=[[1000]] * len(req_ids),
         logprobs=None,
         prompt_logprobs_dict={},
@@ -1052,15 +1027,12 @@ def test_kv_connector_unable_to_allocate():
         block_size=BLOCK_SIZE,
     )
     req_ids = []
-    req_to_index = {}
     for i, request in enumerate(requests):
         scheduler.add_request(request)
         req_ids.append(request.request_id)
-        req_to_index[request.request_id] = i

     MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
         req_ids=req_ids,
-        req_id_to_index=req_to_index,
         sampled_token_ids=[[1000]] * len(req_ids),
         logprobs=None,
         prompt_logprobs_dict={},
@@ -1137,15 +1109,12 @@ def test_kv_connector_handles_preemption():
         block_size=BLOCK_SIZE,
     )
     req_ids = []
-    req_to_index = {}
     for i, request in enumerate(requests):
         scheduler.add_request(request)
         req_ids.append(request.request_id)
-        req_to_index[request.request_id] = i

     MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
         req_ids=req_ids,
-        req_id_to_index=req_to_index,
         sampled_token_ids=[[1000]] * len(req_ids),
         logprobs=None,
         prompt_logprobs_dict={},
@@ -1238,7 +1207,6 @@ def test_kv_connector_handles_preemption():
 def make_output(scheduler: Scheduler):
     return ModelRunnerOutput(
         req_ids=[req.request_id for req in scheduler.running],
-        req_id_to_index={req.request_id: i for i, req in enumerate(scheduler.running)},
         sampled_token_ids=[[1000]] * len(scheduler.running),
         logprobs=None,
         prompt_logprobs_dict={},
@@ -1586,9 +1554,6 @@ def test_priority_scheduling_preemption():
     # Simulate model execution to move requests to running state
     model_output = ModelRunnerOutput(
         req_ids=[req.request_id for req in low_priority_requests],
-        req_id_to_index={
-            req.request_id: i for i, req in enumerate(low_priority_requests)
-        },
         sampled_token_ids=[[100] for _ in low_priority_requests],
         logprobs=None,
         prompt_logprobs_dict={},
@@ -1655,9 +1620,6 @@ def test_priority_scheduling_no_preemption_when_space_available():
     output = scheduler.schedule()
     model_output = ModelRunnerOutput(
         req_ids=[req.request_id for req in low_priority_requests],
-        req_id_to_index={
-            req.request_id: i for i, req in enumerate(low_priority_requests)
-        },
         sampled_token_ids=[[100] for _ in low_priority_requests],
         logprobs=None,
         prompt_logprobs_dict={},
@@ -1903,7 +1865,6 @@ def test_priority_scheduling_heap_property():
         # Simulate completion to make room for next request
         model_output = ModelRunnerOutput(
             req_ids=[req.req_id],
-            req_id_to_index={req.req_id: 0},
             sampled_token_ids=[[100]],
             logprobs=None,
             prompt_logprobs_dict={},
@@ -1983,7 +1944,6 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv():
     # Simulate model execution - 1st decode
     model_output = ModelRunnerOutput(
         req_ids=[request_low.request_id],
-        req_id_to_index={request_low.request_id: 0},
         sampled_token_ids=[[100]],
         # spec_token_ids=None,
         logprobs=None,
@@ -2014,7 +1974,6 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv():
     requests = [request_low, request_high]
     model_output = ModelRunnerOutput(
         req_ids=[req.request_id for req in requests],
-        req_id_to_index={req.request_id: i for i, req in enumerate(requests)},
         sampled_token_ids=[[100] for _ in requests],
         # spec_token_ids=None,
         logprobs=None,
@@ -2040,7 +1999,6 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv():
     # Simulate model execution - 3rd decode
     model_output = ModelRunnerOutput(
         req_ids=[req.request_id for req in requests],
-        req_id_to_index={req.request_id: i for i, req in enumerate(requests)},
         sampled_token_ids=[[], [100]],
         # spec_token_ids=None,
         logprobs=None,
diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py
index a1f53cb255630..396f9ead7f939 100644
--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
@@ -680,7 +680,6 @@ def test_kv_connector_stats_aggregation():
     for i, worker_stats in enumerate([worker1_stats, worker2_stats, worker3_stats]):
         output = ModelRunnerOutput(
             req_ids=[f"req_{i}"],
-            req_id_to_index={f"req_{i}": 0},
             sampled_token_ids=[[123]],  # dummy token
             logprobs=None,
             prompt_logprobs_dict={},
@@ -761,7 +760,6 @@ def test_multi_kv_connector_stats_aggregation():
         stats = make_multi_stats(nixl, foo)
         output = ModelRunnerOutput(
             req_ids=[f"req_{i}"],
-            req_id_to_index={f"req_{i}": 0},
             sampled_token_ids=[[123]],
             logprobs=None,
             prompt_logprobs_dict={},
diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py
index 6f51b9bbcbdaa..7f510d5f63415 100644
--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -215,7 +215,6 @@ def create_model_runner_output(
     # Make request data.
     req_ids = [req.request_id for req in reqs]
-    req_id_to_index = {req_id: idx for idx, req_id in enumerate(req_ids)}

     # Make sampled tokens.
     sampled_token = EOS_TOKEN_ID if use_eos else token_id
@@ -238,7 +237,6 @@ def create_model_runner_output(
     # Make output data structure.
     return ModelRunnerOutput(
         req_ids=req_ids,
-        req_id_to_index=req_id_to_index,
         sampled_token_ids=sampled_token_ids,
         logprobs=None,
         prompt_logprobs_dict={},
diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py
index d647b207575cf..11666fa75256b 100644
--- a/vllm/v1/outputs.py
+++ b/vllm/v1/outputs.py
@@ -102,8 +102,6 @@ class KVConnectorOutput:
 class ModelRunnerOutput:
     # [num_reqs]
     req_ids: list[str]
-    # req_id -> index
-    req_id_to_index: dict[str, int]

     # num_reqs x num_generated_tokens
     # num_generated_tokens is the number of tokens
@@ -154,7 +152,6 @@ class DraftTokenIds:
 EMPTY_MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
     req_ids=[],
-    req_id_to_index={},
     sampled_token_ids=[],
     logprobs=None,
     prompt_logprobs_dict={},
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index f9bf6b720e400..5df46ab9f9363 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2059,7 +2059,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 # NOTE(woosuk): input_batch.req_ids may include requests that are
                 # not scheduled in this step. Therefore, we truncate it here.
                 req_ids=self.input_batch.req_ids[: self.input_batch.num_reqs],
-                req_id_to_index=self.input_batch.req_id_to_index,
                 sampled_token_ids=[],
                 logprobs=None,
                 prompt_logprobs_dict={},
@@ -2254,7 +2253,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         list[list[int]],
         dict[str, Optional[LogprobsTensors]],
         list[str],
-        dict[str, int],
         list[int],
     ]:
         num_nans_in_logits = {}
@@ -2275,7 +2273,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
        # not scheduled in this step. Therefore, we truncate it here.
         num_reqs = self.input_batch.num_reqs
         req_ids_output_copy = self.input_batch.req_ids[:num_reqs].copy()
-        req_id_to_index_output_copy = self.input_batch.req_id_to_index.copy()

         # NOTE: GPU -> CPU Sync happens here.
         # Move as many CPU operations as possible before this sync point.
@@ -2361,7 +2358,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             valid_sampled_token_ids,
             prompt_logprobs_dict,
             req_ids_output_copy,
-            req_id_to_index_output_copy,
             invalid_req_indices,
         )

@@ -2631,7 +2627,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             valid_sampled_token_ids,
             prompt_logprobs_dict,
             req_ids_output_copy,
-            req_id_to_index_output_copy,
             invalid_req_indices,
         ) = self._bookkeeping_sync(
             scheduler_output,
@@ -2655,7 +2650,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         output = ModelRunnerOutput(
             req_ids=req_ids_output_copy,
-            req_id_to_index=req_id_to_index_output_copy,
             sampled_token_ids=valid_sampled_token_ids,
             logprobs=logprobs_lists,
             prompt_logprobs_dict=prompt_logprobs_dict,
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index f9e1fcedc8903..ce60e70c346ab 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -1266,7 +1266,6 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         model_runner_output = ModelRunnerOutput(
             req_ids=req_ids,
-            req_id_to_index=self.input_batch.req_id_to_index,
             sampled_token_ids=valid_sampled_token_ids,
             logprobs=logprobs_lists,
             prompt_logprobs_dict=prompt_logprobs_dict,
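
Note on migrating callers: ModelRunnerOutput no longer carries req_id_to_index, so any consumer that still needs the req_id -> batch-index mapping has to rebuild it from req_ids. A minimal sketch, assuming only the req_ids field shown in this patch; the helper name build_req_id_to_index is illustrative and not part of vLLM:

    # Hypothetical helper: rebuilds the mapping the removed field used to hold.
    # It mirrors the dict comprehension deleted from the test helpers above.
    def build_req_id_to_index(req_ids: list[str]) -> dict[str, int]:
        return {req_id: i for i, req_id in enumerate(req_ids)}

    # Example usage with a ModelRunnerOutput instance named `output` (assumed):
    #   req_id_to_index = build_req_id_to_index(output.req_ids)
    #   assert req_id_to_index[output.req_ids[0]] == 0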