[Bug] Fix Negative Cuda Memory Usage (#25683)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
parent aac622e0cd
commit da554f932e
@@ -3517,7 +3517,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         compilation_counter.num_gpu_runner_capture_triggers += 1

         start_time = time.perf_counter()
-        start_free_gpu_memory = torch.cuda.mem_get_info()[0]

         @contextmanager
         def freeze_gc():
@@ -3540,6 +3539,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # can reuse the memory pool allocated for the large shapes.
         set_cudagraph_capturing_enabled(True)
         with freeze_gc(), graph_capture(device=self.device):
+            start_free_gpu_memory = torch.cuda.mem_get_info()[0]
             cudagraph_mode = self.compilation_config.cudagraph_mode
             assert cudagraph_mode is not None
             if cudagraph_mode.mixed_mode() != CUDAGraphMode.NONE:
@@ -3568,6 +3568,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                         cudagraph_runtime_mode=CUDAGraphMode.FULL,
                         uniform_decode=True)

+            torch.cuda.synchronize()
+            end_free_gpu_memory = torch.cuda.mem_get_info()[0]
+
         # Disable cudagraph capturing globally, so any unexpected cudagraph
         # capturing will be detected and raise an error after here.
         # Note: We don't put it into graph_capture context manager because
@@ -3576,7 +3579,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         set_cudagraph_capturing_enabled(False)

         end_time = time.perf_counter()
-        end_free_gpu_memory = torch.cuda.mem_get_info()[0]
         elapsed_time = end_time - start_time
         cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory
         # This usually takes 5~20 seconds.
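A plausible reading of the hunks: before this commit, start_free_gpu_memory was sampled before the graph_capture() context was even entered, and end_free_gpu_memory was sampled after capture without a device synchronize, so anything freed in between (or a sample racing in-flight capture work) could leave end_free above start_free and make cuda_graph_size negative. The commit brackets only the capture work itself and synchronizes before the second sample. Below is a minimal standalone sketch of the same measurement pattern, not vLLM's actual code; it assumes a CUDA-capable PyTorch install, and capture_with_memory_accounting / _capture_one_graph are hypothetical names used only for illustration.

import torch

def capture_with_memory_accounting(capture_fn):
    # First sample immediately before the capture work, mirroring the commit's
    # move of start_free_gpu_memory inside the graph_capture() context.
    start_free = torch.cuda.mem_get_info()[0]
    capture_fn()
    # Capture-related CUDA work may still be in flight; synchronizing before
    # the second sample keeps it from reading *more* free memory than the
    # first one, which is the negative-size symptom this commit fixes.
    torch.cuda.synchronize()
    end_free = torch.cuda.mem_get_info()[0]
    return start_free - end_free

def _capture_one_graph(keep):
    # Capture a trivial CUDA graph so there is something to measure; graph and
    # buffer are stashed in `keep` so their memory pool stays allocated.
    static_buf = torch.zeros(1 << 20, device="cuda")
    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g):
        static_buf.mul_(2.0)
    keep.append((g, static_buf))

if __name__ == "__main__":
    held = []
    size = capture_with_memory_accounting(lambda: _capture_one_graph(held))
    print(f"capture pinned {size / 2**20:.1f} MiB of device memory")

With both samples taken around the capture loop and a synchronize before the second read, the reported delta can no longer go negative from work that merely happened to overlap the old, wider measurement window.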