mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-27 23:27:20 +08:00
misc padding fixes
Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
parent
a00dabcb33
commit
05ddc34913
@ -43,7 +43,9 @@ class DPMetadata:
|
|||||||
device="cpu",
|
device="cpu",
|
||||||
dtype=torch.int32)
|
dtype=torch.int32)
|
||||||
from vllm.distributed.parallel_state import get_dp_group
|
from vllm.distributed.parallel_state import get_dp_group
|
||||||
|
print("STARTING AR")
|
||||||
dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group)
|
dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group)
|
||||||
|
print("finishing")
|
||||||
return num_tokens_tensor
|
return num_tokens_tensor
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -54,7 +56,9 @@ class DPMetadata:
|
|||||||
device="cpu",
|
device="cpu",
|
||||||
dtype=torch.int32)
|
dtype=torch.int32)
|
||||||
from vllm.distributed.parallel_state import get_dp_group
|
from vllm.distributed.parallel_state import get_dp_group
|
||||||
|
print("Starting AR")
|
||||||
dist.all_reduce(should_ubatch_tensor, group=get_dp_group().cpu_group)
|
dist.all_reduce(should_ubatch_tensor, group=get_dp_group().cpu_group)
|
||||||
|
print("FINISHING AR")
|
||||||
result: bool = bool(torch.all(should_ubatch_tensor == 1).item())
|
result: bool = bool(torch.all(should_ubatch_tensor == 1).item())
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@ -80,6 +84,7 @@ class DPMetadata:
|
|||||||
|
|
||||||
# If num_tokens_across_dp is None, it will be computed by all_reduce
|
# If num_tokens_across_dp is None, it will be computed by all_reduce
|
||||||
# Otherwise, num_tokens_across_dp[dp_rank] should be equal to batchsize
|
# Otherwise, num_tokens_across_dp[dp_rank] should be equal to batchsize
|
||||||
|
# print(f"num_tokens_across_dp {num_tokens_across_dp} batchsize {batchsize}")
|
||||||
assert (num_tokens_across_dp is None
|
assert (num_tokens_across_dp is None
|
||||||
or num_tokens_across_dp[dp_rank] == batchsize)
|
or num_tokens_across_dp[dp_rank] == batchsize)
|
||||||
if num_tokens_across_dp is None:
|
if num_tokens_across_dp is None:
|
||||||
|
|||||||
@ -123,8 +123,6 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
|||||||
do_recv=not send,
|
do_recv=not send,
|
||||||
)
|
)
|
||||||
|
|
||||||
ubatch_ctx = get_current_ubatch_context()
|
|
||||||
ubatch_id = ubatch_ctx.id if ubatch_ctx is not None else -1
|
|
||||||
# yield_and_switch_from_compute_to_comm_impl(schedule="default")
|
# yield_and_switch_from_compute_to_comm_impl(schedule="default")
|
||||||
dispatch(True) # Send
|
dispatch(True) # Send
|
||||||
# torch.cuda.synchronize()
|
# torch.cuda.synchronize()
|
||||||
|
|||||||
@ -573,7 +573,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
def _prepare_inputs(
|
def _prepare_inputs(
|
||||||
self, scheduler_output: "SchedulerOutput"
|
self, scheduler_output: "SchedulerOutput"
|
||||||
) -> tuple[PerLayerAttnMetadata, torch.Tensor,
|
) -> tuple[PerLayerAttnMetadata, torch.Tensor,
|
||||||
Optional[SpecDecodeMetadata], Optional[UBatchSlices]]:
|
Optional[SpecDecodeMetadata], Optional[UBatchSlices],
|
||||||
|
int, Optional[torch.Tensor]]:
|
||||||
total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
|
total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
|
||||||
assert total_num_scheduled_tokens > 0
|
assert total_num_scheduled_tokens > 0
|
||||||
num_reqs = self.input_batch.num_reqs
|
num_reqs = self.input_batch.num_reqs
|
||||||
@ -661,6 +662,14 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
if not should_ubatch and ubatch_slices:
|
if not should_ubatch and ubatch_slices:
|
||||||
ubatch_slices = None
|
ubatch_slices = None
|
||||||
|
|
||||||
|
num_pad_tokens = 0
|
||||||
|
num_tokens_after_padding = None
|
||||||
|
if ubatch_slices:
|
||||||
|
num_pad_tokens, num_tokens_after_padding = self.get_dp_padding_ubatch(ubatch_slices)
|
||||||
|
if num_pad_tokens > 0:
|
||||||
|
self.pad_out_ubatch_first_stage(ubatch_slices, num_pad_tokens)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
self.seq_lens_np[:num_reqs] = (
|
self.seq_lens_np[:num_reqs] = (
|
||||||
self.input_batch.num_computed_tokens_cpu[:num_reqs] +
|
self.input_batch.num_computed_tokens_cpu[:num_reqs] +
|
||||||
@ -782,7 +791,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
self.set_active_loras(self.input_batch, num_scheduled_tokens)
|
self.set_active_loras(self.input_batch, num_scheduled_tokens)
|
||||||
|
|
||||||
return (attn_metadata, logits_indices, spec_decode_metadata,
|
return (attn_metadata, logits_indices, spec_decode_metadata,
|
||||||
ubatch_slices)
|
ubatch_slices, num_pad_tokens, num_tokens_after_padding)
|
||||||
|
|
||||||
def _compute_cascade_attn_prefix_len(
|
def _compute_cascade_attn_prefix_len(
|
||||||
self,
|
self,
|
||||||
@ -1261,30 +1270,42 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
device="cpu",
|
device="cpu",
|
||||||
dtype=torch.int32)
|
dtype=torch.int32)
|
||||||
|
|
||||||
padded_first_ubatch_slice = slice(0, max_tokens_across_dp)
|
|
||||||
padded_second_ubatch_slice = slice(max_tokens_across_dp, max_tokens_across_dp * 2)
|
|
||||||
|
|
||||||
assert max_tokens_across_dp <= 2 * max_tokens_per_ubatch_local, \
|
assert max_tokens_across_dp <= 2 * max_tokens_per_ubatch_local, \
|
||||||
f"max_tokens_across_dp: {max_tokens_across_dp} max_tokens_per_ubatch{max_tokens_per_ubatch_local}"
|
f"max_tokens_across_dp: {max_tokens_across_dp} max_tokens_per_ubatch{max_tokens_per_ubatch_local}"
|
||||||
|
num_pad_tokens = (max_tokens_across_dp * 2) - \
|
||||||
|
(first_ubatch_num_tokens + second_ubatch_num_tokens)
|
||||||
|
return num_pad_tokens, num_tokens_after_padding
|
||||||
|
|
||||||
assert padded_first_ubatch_slice.stop - padded_first_ubatch_slice.start == \
|
# This doesn't actually pad the ubatch slices. It just shifts the
|
||||||
padded_second_ubatch_slice.stop - padded_second_ubatch_slice.start
|
# split point to the correct value so that padding can be applied
|
||||||
|
# to the second ubatch later. Should be called after ubatch
|
||||||
|
# slicing but before attention meta data creation
|
||||||
|
def pad_out_ubatch_first_stage(self, ubatch_slices: UBatchSlices,
|
||||||
|
num_pad_tokens: int):
|
||||||
|
original_num_tokens = ubatch_slices[1][1].stop
|
||||||
|
assert num_pad_tokens < original_num_tokens
|
||||||
|
total_num_tokens_per_ubatch = (original_num_tokens + num_pad_tokens) // 2
|
||||||
|
padded_first_ubatch_slice = slice(0, total_num_tokens_per_ubatch)
|
||||||
|
padded_second_ubatch_slice = slice(total_num_tokens_per_ubatch, original_num_tokens)
|
||||||
|
|
||||||
ubatch_slices[0] = (padded_first_ubatch_slice, padded_first_ubatch_slice)
|
ubatch_slices[0] = (padded_first_ubatch_slice, padded_first_ubatch_slice)
|
||||||
ubatch_slices[1] = (padded_first_ubatch_slice, padded_second_ubatch_slice)
|
ubatch_slices[1] = (padded_second_ubatch_slice, padded_second_ubatch_slice)
|
||||||
|
|
||||||
# Need to assert that none of the padding is on the first ubatch
|
|
||||||
assert padded_first_ubatch_slice.stop - padded_first_ubatch_slice.start
|
|
||||||
|
|
||||||
# if (num_pad_tokens_first_ubatch > 0):
|
# if (num_pad_tokens_first_ubatch > 0):
|
||||||
# print(f"FIRST UBATCH PADDING {num_pad_tokens_first_ubatch} TOTAL: {max_tokens_across_dp_cpu} ORIGINAL{first_ubatch_num_tokens}")
|
# print(f"FIRST UBATCH PADDING {num_pad_tokens_first_ubatch} TOTAL: {max_tokens_across_dp_cpu} ORIGINAL{first_ubatch_num_tokens}")
|
||||||
# if (num_pad_tokens_second_ubatch > 0):
|
# if (num_pad_tokens_second_ubatch > 0):
|
||||||
# print(f"SECOND UBATCH PADDING {num_pad_tokens_second_ubatch} TOTAL: {max_tokens_across_dp_cpu} ORIGINAL{second_ubatch_num_tokens}")
|
# print(f"SECOND UBATCH PADDING {num_pad_tokens_second_ubatch} TOTAL: {max_tokens_across_dp_cpu} ORIGINAL{second_ubatch_num_tokens}")
|
||||||
|
# print(f"num padded tokens: {num_pad_tokens} num tokens tensor: {num_tokens_after_padding} first num_tokens: {first_ubatch_num_tokens} second num tokens {second_ubatch_num_tokens}")
|
||||||
|
|
||||||
|
# This is where the second ubatch is adjusted to account for the padding.
|
||||||
|
# Should be called after attention metadata creation. This just extends
|
||||||
|
# the second ubatch slice out to the total number of tokens
|
||||||
|
# (num_tokens + padding)
|
||||||
|
def pad_out_ubatch_second_stage(self, ubatch_slices: UBatchSlices, num_total_tokens: int):
|
||||||
|
# TODO Add asserts to make sure stage one ran
|
||||||
|
padded_second_ubatch_slice = slice(ubatch_slices[1][1].start, num_total_tokens)
|
||||||
|
ubatch_slices[1] = (ubatch_slices[1][0], padded_second_ubatch_slice)
|
||||||
|
|
||||||
num_pad_tokens = (max_tokens_across_dp * 2) - \
|
|
||||||
(first_ubatch_num_tokens + second_ubatch_num_tokens)
|
|
||||||
print(f"num padded tokens: {num_pad_tokens} num tokens tensor: {num_tokens_after_padding} first num_tokens: {first_ubatch_num_tokens} second num tokens {second_ubatch_num_tokens}")
|
|
||||||
return num_pad_tokens, num_tokens_after_padding
|
|
||||||
|
|
||||||
def should_ubatch(self, should_ubatch: bool) -> bool:
|
def should_ubatch(self, should_ubatch: bool) -> bool:
|
||||||
dp_size = self.vllm_config.parallel_config.data_parallel_size
|
dp_size = self.vllm_config.parallel_config.data_parallel_size
|
||||||
@ -1430,12 +1451,12 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def _ubatch_thread(ubatch_ctx, token_slice, results, save_results,
|
def _ubatch_thread(ubatch_ctx, token_slice, results, save_results,
|
||||||
use_dummy_input):
|
use_dummy_input):
|
||||||
# print(f"Starting Request on ubatch: {ubatch_ctx.id}", flush=True)
|
print(f"Starting Request on ubatch: {ubatch_ctx.id}", flush=True)
|
||||||
model_output = _run(token_slice, ubatch_ctx, use_dummy_input)
|
model_output = _run(token_slice, ubatch_ctx, use_dummy_input)
|
||||||
|
|
||||||
if save_results:
|
if save_results:
|
||||||
results.append((ubatch_ctx.id, model_output))
|
results.append((ubatch_ctx.id, model_output))
|
||||||
# print(f"Finishing Request on ubatch: {ubatch_ctx.id}", flush=True)
|
print(f"Finishing Request on ubatch: {ubatch_ctx.id}", flush=True)
|
||||||
|
|
||||||
def _run_ubatches(ubatch_slices, attn_metadata,
|
def _run_ubatches(ubatch_slices, attn_metadata,
|
||||||
is_dummy_run, num_tokens_across_dp) -> torch.Tensor:
|
is_dummy_run, num_tokens_across_dp) -> torch.Tensor:
|
||||||
@ -1498,7 +1519,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
else:
|
else:
|
||||||
# print("RUN NORMAL")
|
# print("RUN NORMAL")
|
||||||
# No padding for the non ubatch case
|
# No padding for the non ubatch case
|
||||||
assert not num_tokens_across_dp
|
assert num_tokens_across_dp is None
|
||||||
model_output = _run(
|
model_output = _run(
|
||||||
slice(0, num_scheduled_tokens),
|
slice(0, num_scheduled_tokens),
|
||||||
set_forward_context(attn_metadata,
|
set_forward_context(attn_metadata,
|
||||||
@ -1524,16 +1545,18 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
|
|
||||||
return self.kv_connector_no_forward(scheduler_output)
|
return self.kv_connector_no_forward(scheduler_output)
|
||||||
|
|
||||||
|
# num_scheduled_tokens_old = scheduler_output.total_num_scheduled_tokens
|
||||||
|
# num_pad_tokens, num_tokens_after_padding = self.get_dp_padding(num_scheduled_tokens_old)
|
||||||
# Prepare the decoder inputs.
|
# Prepare the decoder inputs.
|
||||||
attn_metadata, logits_indices, spec_decode_metadata, ubatch_slices = (
|
attn_metadata, logits_indices, spec_decode_metadata, ubatch_slices, num_pad_tokens, num_tokens_after_padding = (
|
||||||
self._prepare_inputs(scheduler_output))
|
self._prepare_inputs(scheduler_output))
|
||||||
num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
|
num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
|
||||||
|
if ubatch_slices and num_pad_tokens > 0:
|
||||||
num_tokens_after_padding = None
|
|
||||||
if ubatch_slices:
|
|
||||||
num_pad_tokens, num_tokens_after_padding = \
|
|
||||||
self.get_dp_padding_ubatch(ubatch_slices)
|
|
||||||
num_scheduled_tokens += num_pad_tokens
|
num_scheduled_tokens += num_pad_tokens
|
||||||
|
self.pad_out_ubatch_second_stage(ubatch_slices, num_scheduled_tokens)
|
||||||
|
else:
|
||||||
|
num_tokens_after_padding = None
|
||||||
|
|
||||||
# Run the decoder.
|
# Run the decoder.
|
||||||
# Use persistent buffers for CUDA graphs.
|
# Use persistent buffers for CUDA graphs.
|
||||||
self.maybe_setup_kv_connector(scheduler_output)
|
self.maybe_setup_kv_connector(scheduler_output)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user