mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-25 06:44:25 +08:00
[Model Runner V2] Optimize CUDA graph capture time (#29275)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
parent
b004c00418
commit
62d54ba46d
@ -106,7 +106,10 @@ class CudaGraphManager:
|
|||||||
input_buffers.query_start_loc.np[: batch_size + 1] = np.arange(batch_size + 1)
|
input_buffers.query_start_loc.np[: batch_size + 1] = np.arange(batch_size + 1)
|
||||||
input_buffers.query_start_loc.np[batch_size:] = batch_size
|
input_buffers.query_start_loc.np[batch_size:] = batch_size
|
||||||
input_buffers.query_start_loc.copy_to_gpu()
|
input_buffers.query_start_loc.copy_to_gpu()
|
||||||
input_buffers.seq_lens[:batch_size] = self.max_model_len
|
# HACK(woosuk): To optimize warmup time, we use 1 (instead of max_model_len)
|
||||||
|
# for seq_lens. This leads to a mismatch between seq_lens (GPU) and
|
||||||
|
# seq_lens_np (CPU), which might cause issues in some attention backends.
|
||||||
|
input_buffers.seq_lens[:batch_size] = 1
|
||||||
input_buffers.seq_lens[batch_size:] = 0
|
input_buffers.seq_lens[batch_size:] = 0
|
||||||
|
|
||||||
input_block_tables = [x[:batch_size] for x in block_tables.input_block_tables]
|
input_block_tables = [x[:batch_size] for x in block_tables.input_block_tables]
|
||||||
|
|||||||
@ -313,6 +313,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
|
|||||||
return 0
|
return 0
|
||||||
|
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
|
gc.collect()
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
start_free_gpu_memory = torch.cuda.mem_get_info()[0]
|
start_free_gpu_memory = torch.cuda.mem_get_info()[0]
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user