gpu_model_runner cleanup

Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
Sage Moore 2025-07-03 16:23:11 +00:00
parent 0056be26f6
commit f7b6e600b8

View File

@ -1411,12 +1411,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
ubatch_slices[0] = (padded_first_ubatch_slice, padded_first_ubatch_slice)
ubatch_slices[1] = (padded_second_ubatch_slice, padded_second_ubatch_slice)
# if (num_pad_tokens_first_ubatch > 0):
# print(f"FIRST UBATCH PADDING {num_pad_tokens_first_ubatch} TOTAL: {max_tokens_across_dp_cpu} ORIGINAL{first_ubatch_num_tokens}")
# if (num_pad_tokens_second_ubatch > 0):
# print(f"SECOND UBATCH PADDING {num_pad_tokens_second_ubatch} TOTAL: {max_tokens_across_dp_cpu} ORIGINAL{second_ubatch_num_tokens}")
# print(f"num padded tokens: {num_pad_tokens} num tokens tensor: {num_tokens_after_padding} first num_tokens: {first_ubatch_num_tokens} second num tokens {second_ubatch_num_tokens}")
# This is where the second ubatch is adjusted to account for the padding.
# Should be called after attention metadata creation. This just extends
# the second ubatch slice out to the total number of tokens
@ -1426,15 +1420,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
padded_second_ubatch_slice = slice(ubatch_slices[1][1].start, num_total_tokens)
ubatch_slices[1] = (padded_second_ubatch_slice, padded_second_ubatch_slice)
# Returns num_padded_tokens. This is just a number that should be added to the
# current number of tokens. It is a sum of the number of padded tokens from DP
# padding along with the number of padded tokens from cudagraph padding.
# The second tensor object is None when DP is disabled. When DP is enabled.
# it contains the number of tokens on each dp rank
def compute_padding(self,) -> tuple[int, Optional[torch.Tensor]]:
return (0, torch.Tensor())
def should_ubatch(self, should_ubatch: bool) -> bool:
dp_size = self.vllm_config.parallel_config.data_parallel_size
dp_rank = self.vllm_config.parallel_config.data_parallel_rank
@ -1481,28 +1466,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
tokens_slice = slice(tokens_slice.start, tokens_slice.start + 1)
num_tokens = 1
# if (self.use_cuda_graph
# and num_tokens <= self.cudagraph_batch_sizes[-1]):
# # Use piecewise CUDA graphs.
# # Add padding to the batch size.
# tokens_slice = \
# slice(tokens_slice.start, tokens_slice.start+
# self.vllm_config.pad_for_cudagraph(num_tokens))
# else:
# # Eager mode.
# # Pad tokens to multiple of tensor_parallel_size when
# # enabled collective fusion for SP
# tp_size = self.vllm_config.parallel_config.tensor_parallel_size
# if self.vllm_config.compilation_config.pass_config. \
# enable_sequence_parallelism and tp_size > 1:
# from vllm.utils import round_up
# tokens_slice = slice(
# tokens_slice.start,
# tokens_slice.start + round_up(num_tokens, tp_size))
# update num tokens for padding
# num_tokens = tokens_slice.stop - tokens_slice.start
# _prepare_inputs may reorder the batch, so we must gather multi
# modal outputs after that to ensure the correct order
if self.is_multimodal_model:
@ -1590,8 +1553,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
def model_inputs(tokens_slice: slice, use_dummy_input: bool) -> tuple:
if use_dummy_input:
# print("MAKING DUMMY BATCH")
# assert num_dummy_tokens == 1
return self._get_dummy_model_inputs(num_dummy_tokens)
else:
assert scheduler_output is not None
@ -1610,7 +1571,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
num_tokens_across_dp=num_tokens_across_dp,
skip_cuda_graphs=skip_cuda_graphs
)
# First get some inputs
ubatch_metadata: list[UbatchMetadata] = []
for i, (_, tokens_slice) in enumerate(ubatch_slices):
input_ids, positions, inputs_embeds, intermediate_tensors = \
@ -1683,9 +1644,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# run micro-batched
if ubatch_slices is not None:
assert len(ubatch_slices) == 2, "Only two ubatches has been tested"
# num_tokens = ubatch_slices[1][1].stop
print(f"RUNNING UBATCH {ubatch_slices} is_dummy_run: {is_dummy_run} num_tokens_across_dp{num_tokens_across_dp}")
# assert not is_dummy_run
compute_stream = torch.cuda.current_stream()
ubatch_metadata = _make_ubatch_metadata(
ubatch_slices=ubatch_slices,
@ -1696,7 +1656,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
skip_cuda_graphs=skip_cuda_graphs
)
return _run_ubatches(ubatch_metadata)
# run single batch
# run normal batch
else:
input_ids, positions, inputs_embeds, intermediate_tensors = \
model_inputs(slice(0, num_scheduled_tokens), is_dummy_run)
@ -2038,7 +1998,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
if spec_decode_metadata is None:
# input_ids can be None for multimodal models.
target_token_ids = self.input_ids[:num_scheduled_tokens]
#TODO(sage) make sure this works with mrope
# TODO(woosuk): Support M-RoPE.
target_positions = self.positions[:num_scheduled_tokens]
if self.use_aux_hidden_state_outputs:
@ -2068,7 +2027,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
num_tokens,
)
target_token_ids = self.input_ids[token_indices]
#TODO(sage) make sure this works with mrope
# TODO(woosuk): Support M-RoPE.
target_positions = self.positions[token_indices]
if self.use_aux_hidden_state_outputs: