mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-11 08:17:03 +08:00
misc changes
Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
parent
b74c731342
commit
1d112d90a5
@ -105,6 +105,7 @@ class CUDAPiecewiseBackend:
|
||||
end_monitoring_torch_compile(self.vllm_config)
|
||||
|
||||
def __call__(self, *args) -> Any:
|
||||
# logger.info("CUDA BACKEND CALL")
|
||||
if not self.first_run_finished:
|
||||
self.first_run_finished = True
|
||||
self.check_for_ending_compilation()
|
||||
|
||||
@ -170,6 +170,7 @@ def _support_torch_compile(
|
||||
# e.g. TPU has the compilation logic in model runner, so we don't
|
||||
# need to compile the model inside.
|
||||
if self.do_not_compile or torch.compiler.is_compiling():
|
||||
logger.info("SKIPPING COMPILATION")
|
||||
return self.forward(*args, **kwargs)
|
||||
|
||||
# the first compilation needs to have dynamic shapes marked
|
||||
|
||||
@ -134,14 +134,14 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
||||
do_recv=not send,
|
||||
)
|
||||
|
||||
# yield_and_switch_from_compute_to_comm_impl(schedule="default")
|
||||
yield_and_switch_from_compute_to_comm_impl(schedule="default")
|
||||
dispatch(True) # Send
|
||||
# torch.cuda.synchronize()
|
||||
# print(f"{ubatch_id} AFTER SEND SYNC", flush=True)
|
||||
dispatch(False) # Recv
|
||||
# torch.cuda.synchronize()
|
||||
# print(f"{ubatch_id} AFTER RECV SYNC", flush=True)
|
||||
# yield_and_switch_from_comm_to_compute_impl(schedule="default")
|
||||
yield_and_switch_from_comm_to_compute_impl(schedule="default")
|
||||
# torch.cuda.synchronize()
|
||||
if expert_x_scale is not None:
|
||||
expert_x_scale = expert_x_scale[:, :, 0:1]
|
||||
@ -185,11 +185,11 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
|
||||
do_recv=not send,
|
||||
)
|
||||
|
||||
# yield_and_switch_from_compute_to_comm_impl(schedule="default")
|
||||
yield_and_switch_from_compute_to_comm_impl(schedule="default")
|
||||
combine(True)
|
||||
# torch.cuda.synchronize()
|
||||
# print(f"{ubatch_id} AFTER COMBINE SEND SYNC", flush=True)
|
||||
combine(False)
|
||||
# print(f"{ubatch_id} AFTER COMBINE RECV SYNC", flush=True)
|
||||
# yield_and_switch_from_comm_to_compute_impl(schedule="default")
|
||||
yield_and_switch_from_comm_to_compute_impl(schedule="default")
|
||||
# torch.cuda.synchronize()
|
||||
|
||||
@ -206,6 +206,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
self.use_cuda_graph = (self.vllm_config.compilation_config.level
|
||||
== CompilationLevel.PIECEWISE
|
||||
and not self.model_config.enforce_eager)
|
||||
logger.info(f"self.use_cuda_graph {self.use_cuda_graph}")
|
||||
# TODO(woosuk): Provide an option to tune the max cudagraph batch size.
|
||||
# The convention is different.
|
||||
# self.cudagraph_batch_sizes sorts in ascending order.
|
||||
@ -702,7 +703,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
if num_pad_tokens < scheduler_output.total_num_scheduled_tokens:
|
||||
self.pad_out_ubatch_first_stage(ubatch_slices, num_pad_tokens)
|
||||
else:
|
||||
assert False
|
||||
# We bail out of ubatching here. This accounts for the case where
|
||||
# the padding would result in an "empty" second ubatch.
|
||||
# TODO: just make the second ubatch a dummy ubatch
|
||||
@ -711,13 +711,13 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
|
||||
# Note that if we are attempting to ubatch by this point then we know that no
|
||||
# DP ranks are doing dummy runs
|
||||
# if ubatch_slices:
|
||||
# should_ubatch = self.should_ubatch(False if ubatch_bailout else True)
|
||||
# if not should_ubatch:
|
||||
# logger.info("SUCCESSFULLY BAILED OUT")
|
||||
# num_pad_tokens = 0
|
||||
# num_tokens_after_padding = None
|
||||
# ubatch_slices = None
|
||||
if ubatch_slices:
|
||||
should_ubatch = self.should_ubatch(False if ubatch_bailout else True)
|
||||
if not should_ubatch:
|
||||
logger.info("SUCCESSFULLY BAILED OUT")
|
||||
num_pad_tokens = 0
|
||||
num_tokens_after_padding = None
|
||||
ubatch_slices = None
|
||||
|
||||
|
||||
# This AR is only necessary in the case described above where
|
||||
@ -2150,6 +2150,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
# For profiling runs we don't want microbatching but for
|
||||
# dp dummy runs we do.
|
||||
allow_microbatching: bool = False,
|
||||
# Maybe return a cudagraph here
|
||||
) -> torch.Tensor:
|
||||
|
||||
should_microbatch = False
|
||||
@ -2402,19 +2403,24 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
start_time = time.perf_counter()
|
||||
start_free_gpu_memory = torch.cuda.mem_get_info()[0]
|
||||
|
||||
logger.info("CAPTURE MODEL START")
|
||||
# Trigger CUDA graph capture for specific shapes.
|
||||
# Capture the large shapes first so that the smaller shapes
|
||||
# can reuse the memory pool allocated for the large shapes.
|
||||
with graph_capture(device=self.device):
|
||||
skip_attn = not self.vllm_config.compilation_config.full_cuda_graph
|
||||
allow_microbatching = self.parallel_config.enable_microbatching
|
||||
for num_tokens in reversed(self.cudagraph_batch_sizes):
|
||||
for _ in range(self.vllm_config.compilation_config.
|
||||
cudagraph_num_of_warmups):
|
||||
self._dummy_run(num_tokens, skip_attn=skip_attn)
|
||||
self._dummy_run(num_tokens, skip_attn=skip_attn,
|
||||
allow_microbatching=allow_microbatching)
|
||||
# print("CUDAGRAPH CAPTURE START")
|
||||
self._dummy_run(num_tokens, skip_attn=skip_attn)
|
||||
self._dummy_run(num_tokens, skip_attn=skip_attn,
|
||||
allow_microbatching=allow_microbatching)
|
||||
# print("CUDAGRAPH CAPTURE END")
|
||||
|
||||
logger.info("CAPTURE MODEL END")
|
||||
end_time = time.perf_counter()
|
||||
end_free_gpu_memory = torch.cuda.mem_get_info()[0]
|
||||
elapsed_time = end_time - start_time
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user