From a8675b7d9886dc690a8a2db85fcc2d4d91af8f03 Mon Sep 17 00:00:00 2001
From: Sage Moore
Date: Thu, 5 Jun 2025 14:25:21 +0000
Subject: [PATCH] ubatch padding should work now

Signed-off-by: Sage Moore
---
 vllm/v1/worker/gpu_model_runner.py | 42 ++++++++++++++++--------------
 1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a3d4ec4e64a3d..98e3835e6dbeb 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1231,9 +1231,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                                                 dtype=torch.int32)
         return max_tokens_across_dp_cpu - num_tokens, num_tokens_after_padding
 
-    def pad_ubatch(self, target_num_tokens, ubatch_slice: UbatchSlice):
-        pass
-
     def get_dp_padding_ubatch(
             self, ubatch_slices: UBatchSlices) -> tuple[int, Optional[torch.Tensor]]:
         dp_size = self.vllm_config.parallel_config.data_parallel_size
@@ -1245,7 +1242,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         first_ubatch_num_tokens = first_ubatch_slice[1].stop - first_ubatch_slice[1].start
         second_ubatch_num_tokens = second_ubatch_slice[1].stop - second_ubatch_slice[1].start
 
-        num_tokens = max(first_ubatch_num_tokens, second_ubatch_num_tokens)
+        max_tokens_per_ubatch = max(first_ubatch_num_tokens, second_ubatch_num_tokens)
 
         # For DP: Don't pad when setting enforce_eager.
         # This lets us set enforce_eager on the prefiller in a P/D setup and
@@ -1259,19 +1256,23 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             return 0, None
 
         num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
-            num_tokens, dp_size, dp_rank)
+            max_tokens_per_ubatch, dp_size, dp_rank)
         max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item()
         num_tokens_after_padding = torch.tensor([max_tokens_across_dp_cpu] *
                                                 dp_size,
                                                 device="cpu",
                                                 dtype=torch.int32)
-        # Note that this num_pad_tokens will actually
-        # be the number of tokens added to each ubatch.
-        # Meaning 2*num_pad_tokens are added to each DP rank
-        num_pad_tokens = max_tokens_across_dp_cpu - num_tokens
-        self.pad_ubatch(num_pad_tokens, first_ubatch_slice)
-        self.pad_ubatch(num_pad_tokens, second_ubatch_slice)
-        return num_pad_tokens, num_tokens_after_padding
+
+        num_pad_tokens_first_ubatch = max_tokens_across_dp_cpu - first_ubatch_num_tokens
+        num_pad_tokens_second_ubatch = max_tokens_across_dp_cpu - second_ubatch_num_tokens
+
+        padded_first_ubatch_slice = slice(0, max_tokens_across_dp_cpu)
+        padded_second_ubatch_slice = slice(max_tokens_across_dp_cpu, 2 * max_tokens_across_dp_cpu)
+
+        ubatch_slices[0] = (ubatch_slices[0][0], padded_first_ubatch_slice)
+        ubatch_slices[1] = (ubatch_slices[1][0], padded_second_ubatch_slice)
+
+        return num_pad_tokens_first_ubatch + num_pad_tokens_second_ubatch, num_tokens_after_padding
 
     def should_ubatch(self, should_ubatch: bool) -> bool:
         dp_size = self.vllm_config.parallel_config.data_parallel_size
@@ -1513,17 +1514,20 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             self._prepare_inputs(scheduler_output))
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
-        num_pad_tokens, num_tokens_after_padding = \
-            self.get_dp_padding_ubatch(ubatch_slices)
-        num_scheduled_tokens += num_pad_tokens
+        num_tokens_after_padding = None
+        if ubatch_slices:
+            num_pad_tokens, num_tokens_after_padding = \
+                self.get_dp_padding_ubatch(ubatch_slices)
+            num_scheduled_tokens += num_pad_tokens
 
         # Run the decoder.
         # Use persistent buffers for CUDA graphs.
         self.maybe_setup_kv_connector(scheduler_output)
         model_output = self._run_model(
-            attn_metadata,
-            num_scheduled_tokens,
-            ubatch_slices,
-            scheduler_output,
+            attn_metadata=attn_metadata,
+            num_scheduled_tokens=num_scheduled_tokens,
+            ubatch_slices=ubatch_slices,
+            scheduler_output=scheduler_output,
+            num_tokens_across_dp=num_tokens_after_padding
        )
        self.maybe_wait_for_kv_save()
        finished_sending, finished_recving = (
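
Reviewer note (not part of the patch): below is a minimal sketch of the slice rewriting that
get_dp_padding_ubatch now performs, assuming each entry of ubatch_slices is a
(request_slice, token_slice) tuple as the diff suggests. The helper name pad_ubatch_slices and
the example token counts are hypothetical.

    # Sketch only -- mirrors the padding arithmetic in get_dp_padding_ubatch.
    def pad_ubatch_slices(ubatch_slices, max_tokens_across_dp: int) -> int:
        """Stretch both ubatches to max_tokens_across_dp tokens; return total pad tokens."""
        first_tokens = ubatch_slices[0][1].stop - ubatch_slices[0][1].start
        second_tokens = ubatch_slices[1][1].stop - ubatch_slices[1][1].start
        # First ubatch covers [0, max), second covers [max, 2*max), as in the patch.
        ubatch_slices[0] = (ubatch_slices[0][0], slice(0, max_tokens_across_dp))
        ubatch_slices[1] = (ubatch_slices[1][0],
                            slice(max_tokens_across_dp, 2 * max_tokens_across_dp))
        return (max_tokens_across_dp - first_tokens) + (max_tokens_across_dp - second_tokens)

    # Example: this rank split 176 tokens into ubatches of 96 and 80, but the largest
    # ubatch across DP ranks has 128 tokens, so both local ubatches are padded to 128.
    slices = [(slice(0, 3), slice(0, 96)), (slice(3, 5), slice(96, 176))]
    assert pad_ubatch_slices(slices, 128) == 80
    assert slices == [(slice(0, 3), slice(0, 128)), (slice(3, 5), slice(128, 256))]

The effect is that every DP rank ends up with the same per-ubatch token count
(num_tokens_after_padding), with each ubatch padded by its own deficit rather than a single
shared num_pad_tokens as in the old pad_ubatch stub.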