From 0e499c4f4d75642f8337fa091bcfdfadceb5247d Mon Sep 17 00:00:00 2001
From: Sage Moore
Date: Wed, 2 Jul 2025 21:11:28 +0000
Subject: [PATCH] first round of cleanups

Signed-off-by: Sage Moore
---
 examples/offline_inference/data_parallel.py |  5 ++-
 vllm/compilation/decorators.py              |  1 -
 vllm/v1/worker/gpu_model_runner.py          | 37 ++-------------------
 3 files changed, 7 insertions(+), 36 deletions(-)

diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py
index 5e00945708f32..bef336edc21ac 100644
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -96,6 +96,7 @@ def main(
     trust_remote_code,
     max_num_seqs,
     gpu_memory_utilization,
+    enable_microbatching,
 ):
     os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
     os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
@@ -140,7 +141,7 @@ def main(
     # sampling params. here we set different max_tokens for different
     # ranks for demonstration.
     sampling_params = SamplingParams(
-        temperature=0.8, top_p=0.95, max_tokens=[16, 20][global_dp_rank % 2]
+        temperature=0.8, top_p=0.95, max_tokens=[40, 64][global_dp_rank % 2]
     )

     # Create an LLM.
@@ -152,6 +153,7 @@ def main(
         trust_remote_code=trust_remote_code,
         max_num_seqs=max_num_seqs,
         gpu_memory_utilization=gpu_memory_utilization,
+        enable_microbatching=enable_microbatching,
     )
     outputs = llm.generate(prompts, sampling_params)
     # Print the outputs.
@@ -208,6 +210,7 @@ if __name__ == "__main__":
                 args.trust_remote_code,
                 args.max_num_seqs,
                 args.gpu_memory_utilization,
+                args.enable_microbatching,
             ),
         )
         proc.start()
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index 54d5af2ad29d8..5f1b268a1d6fe 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -157,7 +157,6 @@ def _support_torch_compile(
             vllm_config.compilation_config.level in [
                 CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS
             ] or not supports_dynamo()
-        self.do_not_compile = True
         if self.do_not_compile:
             return
         compilation_counter.num_models_seen += 1
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 36e846fc23316..94a433c36b872 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -98,13 +98,7 @@ PerLayerAttnMetadata: TypeAlias = Union[list[AttnMetadataDict],

 UbatchSlice: TypeAlias = tuple[slice, slice]
 UBatchSlices: TypeAlias = list[UbatchSlice]

-import dataclasses
-@dataclasses.dataclass
-class CUDAGraphMetaData:
-    cudagraph: torch.cuda.CUDAGraph
-    using_ubatching: bool
-    outputs: Optional[Any] = None


 class GPUModelRunner(LoRAModelRunnerMixin):
@@ -148,7 +142,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         self.max_num_tokens = scheduler_config.max_num_batched_tokens
         self.max_num_reqs = scheduler_config.max_num_seqs
-        self.cudagraphs = {}

         # Model-related.
         self.num_query_heads = model_config.get_num_attention_heads(
             parallel_config)
@@ -1402,9 +1395,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):

         return num_dp_pad_tokens + num_pad_tokens, num_tokens_after_padding

-    def get_dp_padding_ubatch(self,
-                              ubatch_slices: UBatchSlices,
-                              include_cudagraphs: bool = True) -> tuple[int, Optional[torch.Tensor]]:
+    def get_dp_padding_ubatch(self,
+                              ubatch_slices: UBatchSlices) -> tuple[int, Optional[torch.Tensor]]:

         dp_size = self.vllm_config.parallel_config.data_parallel_size
         if dp_size == 1:
@@ -1424,18 +1416,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):

         num_tokens_unpadded = first_ubatch_num_tokens + second_ubatch_num_tokens
         num_tokens_padded = round_up(num_tokens_unpadded, 2)
-        if (include_cudagraphs and self.use_cuda_graph
-                and num_tokens_unpadded <= self.cudagraph_batch_sizes[-1]):
-            # Add padding to the batch size.
-            num_tokens_padded = self.vllm_config.pad_for_cudagraph(num_tokens_unpadded)
-        else:
-            # Eager mode.
-            # Pad tokens to multiple of tensor_parallel_size when
-            # enabled collective fusion for SP
-            tp_size = self.vllm_config.parallel_config.tensor_parallel_size
-            if self.vllm_config.compilation_config.pass_config. \
-                enable_sequence_parallelism and tp_size > 1:
-                num_tokens_padded = round_up(num_tokens_unpadded, tp_size)

         num_tokens_per_ubatch = num_tokens_padded // 2

@@ -1602,8 +1582,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
            scheduler_output: Optional["SchedulerOutput"] = None,
            is_dummy_run: bool = False,
            num_tokens_across_dp: Optional[torch.Tensor] = None,
-           skip_cuda_graphs: bool = False,
-           build_cuda_graph: bool = False):
+           skip_cuda_graphs: bool = False):

        @dataclasses.dataclass
        class UbatchMetadata:
@@ -2430,12 +2409,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
     def _dummy_run(
         self,
         num_tokens: int,
-        skip_attn: bool = True,
         # Maybe return a cudagraph here
         capture_attn_cudagraph: bool = False,
         skip_eplb: bool = False,
         is_profile: bool = False,
-        build_cuda_graph: bool = False
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # if allow_microbatching:

@@ -2469,7 +2446,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         num_scheduled_tokens = np.array(num_scheduled_tokens_list,
                                         dtype=np.int32)

-        ubatch_slices = None
         # We currently only microbatch if the number of tokens is
         # over a certain threshold.
         # logger.info("PADDING DUMMY DONE")
@@ -2486,8 +2462,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             seq_lens = self.seq_lens[:num_reqs]
             max_query_len = num_tokens
-            if ubatch_slices is not None:
-                max_query_len = 1

             common_attn_metadata = CommonAttentionMetadata(
                 query_start_loc=query_start_loc,
                 seq_lens=seq_lens,
@@ -2510,10 +2484,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             outputs = self._run_model(
                 attn_metadata,
                 num_tokens,
-                ubatch_slices=ubatch_slices,
                 is_dummy_run=True,
                 num_tokens_across_dp=num_tokens_across_dp,
-                build_cuda_graph=build_cuda_graph
             )
         if self.use_aux_hidden_state_outputs:
             hidden_states, _ = outputs
@@ -2754,13 +2726,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         start_time = time.perf_counter()
         start_free_gpu_memory = torch.cuda.mem_get_info()[0]
-        logger.info("CAPTURE MODEL START")

         # Trigger CUDA graph capture for specific shapes.
         # Capture the large shapes first so that the smaller shapes
         # can reuse the memory pool allocated for the large shapes.
         with graph_capture(device=self.device):
             full_cg = self.full_cuda_graph
-            allow_microbatching = False
             for num_tokens in tqdm(reversed(self.cudagraph_batch_sizes),
                                    desc="Capturing CUDA graphs",
                                    total=len(self.cudagraph_batch_sizes)):
@@ -2774,7 +2744,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                                     capture_attn_cudagraph=full_cg,
                                     skip_eplb=True)

-        logger.info("CAPTURE MODEL END")
         end_time = time.perf_counter()
         end_free_gpu_memory = torch.cuda.mem_get_info()[0]
         elapsed_time = end_time - start_time