diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 51a964817505a..79f4300d4f566 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -599,7 +599,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         if not should_ubatch:
             return (None, 0, None)
 
-        # For pure decode we can just create ubatchs by cutting the request
+        # For pure decode we can just create ubatches by cutting the request
         # in half
         b0_reqs_end = num_reqs // 2
         b0_tokens_end = total_num_scheduled_tokens // 2
@@ -610,6 +610,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             (slice(b0_reqs_end, num_reqs),
              slice(b0_tokens_end, total_num_scheduled_tokens)),
         ]
+
+        # Compute ubatch padding. This currently only accounts for DP padding
         num_pad_tokens = 0
         num_tokens_after_padding = None
         ubatch_abort = False
@@ -817,19 +819,16 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             if ubatch_slices is not None:
                 for ubid, (req_slice,
                            token_slice) in enumerate(ubatch_slices):
-                    # Run a dummy batch if its a empty ubatch
-                    if token_slice.stop <= token_slice.start:
-                        attn_metadata_i = None
-                    else:
-                        attn_metadata_i = (
-                            self.attn_metadata_builders[kv_cache_group_id].
-                            build_slice(
-                                req_slice=req_slice,
-                                token_slice=token_slice,
-                                max_query_len=max(tokens[req_slice]),
-                                common_prefix_len=common_prefix_len,
-                                common_attn_metadata=common_attn_metadata,
-                            ))
+                    assert token_slice.stop > token_slice.start
+                    attn_metadata_i = (
+                        self.attn_metadata_builders[kv_cache_group_id].
+                        build_slice(
+                            req_slice=req_slice,
+                            token_slice=token_slice,
+                            max_query_len=max(tokens[req_slice]),
+                            common_prefix_len=common_prefix_len,
+                            common_attn_metadata=common_attn_metadata,
+                        ))
                     for layer_name in kv_cache_group_spec.layer_names:
                         assert type(attn_metadata) is list
                         attn_metadata[ubid][layer_name] = attn_metadata_i
@@ -1416,7 +1415,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         ubatch_slices[1] = (padded_second_ubatch_slice, padded_second_ubatch_slice)
 
     # This is where the second ubatch is adjusted to account for the padding.
-    # Should be called after attention metadata creation. This just extends
+    # Should be called after attention metadata creation. This just pads
    # the second ubatch slice out to the total number of tokens
     # (num_tokens + padding)
     def pad_out_ubatch_second_stage(self, ubatch_slices: UBatchSlices, num_total_tokens: int):
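
For context, a minimal runnable sketch of the pure-decode split the first hunk performs. The standalone helper `make_decode_ubatch_slices` and the `UBatchSlices` alias here are hypothetical stand-ins (the real logic lives inline in `GPUModelRunner` and uses vLLM's own types); only the halving arithmetic mirrors the diff:

```python
from typing import List, Tuple

# Simplified stand-in for the runner's UBatchSlices:
# a list of (request_slice, token_slice) pairs.
UBatchSlices = List[Tuple[slice, slice]]

def make_decode_ubatch_slices(num_reqs: int,
                              total_num_scheduled_tokens: int) -> UBatchSlices:
    """Hypothetical helper: split a pure-decode batch into two ubatches
    by cutting the request (and token) range in half, as in the diff."""
    b0_reqs_end = num_reqs // 2
    b0_tokens_end = total_num_scheduled_tokens // 2
    return [
        (slice(0, b0_reqs_end), slice(0, b0_tokens_end)),
        (slice(b0_reqs_end, num_reqs),
         slice(b0_tokens_end, total_num_scheduled_tokens)),
    ]

# Pure decode schedules one token per request, so the request and token
# ranges split at the same midpoint; with an odd count the second
# ubatch gets the extra request.
print(make_decode_ubatch_slices(5, 5))
# [(slice(0, 2), slice(0, 2)), (slice(2, 5), slice(2, 5))]
```

Note how this interacts with the third hunk: the new `assert token_slice.stop > token_slice.start` replaces the old empty-ubatch fallback, which presumes `should_ubatch` only fires for batches large enough that neither half is empty.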
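
Likewise, a hedged sketch of the second-stage padding described by the comment in the last hunk: after attention metadata is built, the second ubatch slice is stretched out to the padded token count. This illustrates the described behavior under that reading, not the actual method body; setting both tuple elements to the same padded slice follows the first-stage line shown in the diff:

```python
def pad_out_ubatch_second_stage(ubatch_slices, num_total_tokens):
    """Sketch: extend the second ubatch out to num_tokens + padding.
    Both elements get the same slice, matching the first-stage line in
    the diff; in pure decode, request and token indices coincide."""
    token_start = ubatch_slices[1][1].start
    padded_second_ubatch_slice = slice(token_start, num_total_tokens)
    ubatch_slices[1] = (padded_second_ubatch_slice,
                        padded_second_ubatch_slice)

# Six decode tokens split 3/3, then padded up to 8 total (e.g. to match
# a DP peer's token count).
slices = [(slice(0, 3), slice(0, 3)), (slice(3, 6), slice(3, 6))]
pad_out_ubatch_second_stage(slices, 8)
print(slices[1])
# (slice(3, 8), slice(3, 8))
```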