diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index e95f4594c0a13..f98fafe7996aa 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1649,7 +1649,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         if ubatch_slices is not None:
             # num_tokens = ubatch_slices[1][1].stop
             # print(f"RUNNING UBATCH {num_tokens} is_dummy_run: {is_dummy_run} num_tokens_across_dp{num_tokens_across_dp}")
-            assert not is_dummy_run
+            # assert not is_dummy_run
             model_output = _run_ubatches(ubatch_slices, attn_metadata, is_dummy_run,
                                          num_tokens_across_dp=num_tokens_across_dp)
         # run single batch
@@ -2224,10 +2224,22 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         allow_microbatching: bool = False,
     ) -> torch.Tensor:
 
-        should_microbatch = False
+        if allow_microbatching:
+            logger.info("ATTEMPTING TO UBATCH THE DUMMY RUN")
+
+
+        # TODO(Sage) We need some more code to properly handle
+        # mixing normal and dummy runs. The DP padding needs to
+        # be properly setup. Since we only support microbatching
+        # in cuda graph capture it's fine to ignore the DP padding
+        # for now.
+        should_ubatch = num_tokens >= \
+            self.parallel_config.microbatching_token_threshold and \
+            allow_microbatching
         # _dummy_run doesn't go through _prepare_inputs so
         # we synchronize with other DP ranks here
-        self.should_ubatch(should_microbatch)
+        should_ubatch = self.should_ubatch(allow_microbatching)
+        assert not should_ubatch
         # Padding for DP
         # logger.info("PADDING DUMMY")
         num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens)
@@ -2278,19 +2290,22 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                 for layer_name in kv_cache_group_spec.layer_names:
                     attn_metadata[layer_name] = attn_metadata_i
 
-        # should_microbatch = (
-        #     allow_microbatching
-        #     and self.vllm_config.parallel_config.enable_microbatching
-        #     and self.vllm_config.parallel_config.always_microbatch_if_enabled)
-        # dummy_microbatches = [(slice(0, 0), slice(0, 0)),
-        #                       (slice(0, 0), slice(0, 0))]
+        dummy_microbatches = None
+        # We currently only microbatch if the number of tokens is
+        # over a certain threshold.
+        if should_ubatch:
+            assert num_tokens % 2 == 0
+            # TODO (Sage) Add actual slices here
+            assert False
+            dummy_microbatches = [(slice(0, 0), slice(0, 0)),
+                                  (slice(0, 0), slice(0, 0))]
         with self.maybe_dummy_run_with_lora(self.lora_config,
                                             num_scheduled_tokens):
             outputs = self._run_model(
                 attn_metadata,
                 num_tokens,
-                ubatch_slices=None,
+                ubatch_slices=dummy_microbatches,
                 is_dummy_run=True,
                 num_tokens_across_dp=num_tokens_across_dp
             )
 
@@ -2488,8 +2503,12 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                            total=len(self.cudagraph_batch_sizes)):
                 for _ in range(
                         self.compilation_config.cudagraph_num_of_warmups):
-                    self._dummy_run(num_tokens, capture_attn_cudagraph=full_cg)
-                self._dummy_run(num_tokens, capture_attn_cudagraph=full_cg)
+                    self._dummy_run(num_tokens,
+                                    capture_attn_cudagraph=full_cg,
+                                    allow_microbatching=allow_microbatching)
+                self._dummy_run(num_tokens,
+                                capture_attn_cudagraph=full_cg,
+                                allow_microbatching=allow_microbatching)
 
         logger.info("CAPTURE MODEL END")
         end_time = time.perf_counter()
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index e485cfcfa8f99..8f132ee67ae5f 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -319,7 +319,7 @@ class Worker(WorkerBase):
 
     def execute_dummy_batch(self) -> None:
         # TODO: adding allow_microbatching will break non-gpu backends
-        self.model_runner._dummy_run(1, allow_microbatching=True)
+        self.model_runner._dummy_run(1)
 
     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.model_runner.add_lora(lora_request)
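For reference, a minimal sketch of how the `# TODO (Sage) Add actual slices here` placeholder might eventually be filled in. This is not part of the diff: the helper name `make_dummy_ubatch_slices` is hypothetical, and it assumes the dummy batch is simply split into two equal microbatches shaped like the existing `(slice(0, 0), slice(0, 0))` placeholders (the exact meaning of the two slices per entry is not spelled out here).

```python
# Hypothetical helper, not part of this diff: builds two equal dummy
# microbatch slices, mirroring the assert num_tokens % 2 == 0 check
# already present in _dummy_run.
def make_dummy_ubatch_slices(num_tokens: int) -> list[tuple[slice, slice]]:
    assert num_tokens % 2 == 0, "microbatching assumes an even token count"
    half = num_tokens // 2
    return [
        (slice(0, half), slice(0, half)),
        (slice(half, num_tokens), slice(half, num_tokens)),
    ]
```

Under that assumption, `dummy_microbatches = make_dummy_ubatch_slices(num_tokens)` would replace the current `assert False` placeholder once mixed normal/dummy DP padding is handled.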