diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index b08898d253cab..ef77d9e2d3ffd 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -578,8 +578,10 @@ class Scheduler(SchedulerInterface): scheduled_spec_decode_tokens, req_to_new_blocks, ) + scheduled_requests = (scheduled_new_reqs + scheduled_running_reqs + + scheduled_resumed_reqs) structured_output_request_ids, grammar_bitmask = ( - self.get_grammar_bitmask(self.running, + self.get_grammar_bitmask(scheduled_requests, scheduled_spec_decode_tokens)) scheduler_output = SchedulerOutput( scheduled_new_reqs=new_reqs_data, diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py index 127c8876525b5..b9b09bea1e80f 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -90,13 +90,14 @@ def apply_grammar_bitmask( seq = sorted(scheduler_output.structured_output_request_ids.items(), key=lambda x: x[1]) for req_id, _ in seq: - logit_index = struct_out_req_batch_indices[req_id] num_spec_tokens = len( scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])) - for i in range(1 + num_spec_tokens): - sorted_bitmask[logit_index + i] = \ - grammar_bitmask[cumulative_index + i] - out_indices.append(logit_index + i) + if req_id in struct_out_req_batch_indices: + logit_index = struct_out_req_batch_indices[req_id] + for i in range(1 + num_spec_tokens): + sorted_bitmask[logit_index + i] = \ + grammar_bitmask[cumulative_index + i] + out_indices.append(logit_index + i) cumulative_index += 1 + num_spec_tokens grammar_bitmask = sorted_bitmask