[BugFix][V1] Fix memory profiling bug (#18974)
Signed-off-by: luka <luka@neuralmagic.com>
This commit is contained in:
parent 88be823d57
commit 2d8476e465
@@ -86,6 +86,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
             } if model_info.speculative_model else None,
             trust_remote_code=model_info.trust_remote_code,
             max_model_len=model_info.max_model_len,
+            # these tests seem to produce leftover memory
+            gpu_memory_utilization=0.80,
             load_format="dummy",
             hf_overrides=hf_overrides,
         )
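The comment added here explains the change: these initialization tests tend to leave GPU memory behind, so each test now asks for only 80% of the device. A minimal, hypothetical helper (not part of the repo) for inspecting such leftover memory between tests with plain torch calls:

import gc

import torch


def report_leftover_gpu_memory() -> None:
    # Drop Python references and release torch's cached allocator blocks so
    # mem_get_info() reflects memory actually held on the device.
    gc.collect()
    torch.cuda.empty_cache()
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    torch_bytes = torch.cuda.memory_stats().get("allocated_bytes.all.current", 0)
    print(f"free {free_bytes / 2**30:.2f} / {total_bytes / 2**30:.2f} GiB, "
          f"torch-allocated {torch_bytes / 2**30:.2f} GiB")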
@@ -42,7 +42,7 @@ def vllm_model(vllm_runner, request) -> Generator[VllmRunner, None, None]:
             #TODO: enable this once we support it for
             # prompt logprobs.
             enable_prefix_caching=request.param,
-            gpu_memory_utilization=0.5,
+            gpu_memory_utilization=0.4, # up to 2 alive concurrently
     ) as vllm_model:
         yield vllm_model
 
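gpu_memory_utilization is a fraction of total device memory, which is why capping each fixture instance at 0.4 lets two of them coexist on one GPU. A short sketch of how that fraction maps to bytes (the function name is illustrative; the formula mirrors the requested_memory computation in the worker change further down):

import torch


def utilization_budget_bytes(gpu_memory_utilization: float) -> int:
    # The budget is a share of *total* device memory, not of currently free
    # memory: requested = total_gpu_memory * gpu_memory_utilization.
    _, total_gpu_memory = torch.cuda.mem_get_info()
    return int(total_gpu_memory * gpu_memory_utilization)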
@@ -343,10 +343,13 @@ def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")
 
-        runner = VllmRunner("facebook/opt-125m",
-                            max_logprobs=1,
-                            enable_prefix_caching=False,
-                            max_model_len=256)
+        runner = VllmRunner(
+            "facebook/opt-125m",
+            max_logprobs=1,
+            enable_prefix_caching=False,
+            # 2 other llms alive during whole session
+            gpu_memory_utilization=0.15,
+            max_model_len=256)
         vllm_sampling_params = SamplingParams(logprobs=1)
         # should pass
         runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
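Taken together with the fixture change above, the test comments imply a per-device budget for the session. A worked check of that arithmetic (a sketch based only on the comments, not code from the repo):

# Two vllm_model fixture instances (prefix caching off/on) at 0.4 each may be
# alive for the whole session, plus this runner capped at 0.15.
session_budget = 2 * 0.4 + 0.15   # approximately 0.95
assert session_budget <= 1.0      # leaves ~5% of the device as headroom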
@@ -130,7 +130,20 @@ class Worker(WorkerBase):
             _check_if_gpu_supports_dtype(self.model_config.dtype)
             gc.collect()
             torch.cuda.empty_cache()
-            self.init_gpu_memory = torch.cuda.mem_get_info()[0]
+            self.init_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
+            requested_memory = (total_gpu_memory *
+                                self.cache_config.gpu_memory_utilization)
+            if self.init_gpu_memory < requested_memory:
+                GiB = lambda b: round(b / GiB_bytes, 2)
+                raise ValueError(
+                    f"Free memory on device ({GiB(self.init_gpu_memory)}/"
+                    f"{GiB(total_gpu_memory)} GiB) on startup is less than "
+                    f"desired GPU memory utilization "
+                    f"({self.cache_config.gpu_memory_utilization}, "
+                    f"{GiB(requested_memory)} GiB). Decrease GPU memory "
+                    f"utilization or reduce GPU memory used by other processes."
+                )
+
         else:
             raise RuntimeError(
                 f"Not support device type: {self.device_config.device}")
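The new guard fails fast when the device cannot honor the requested utilization at startup, instead of failing later during profiling. A self-contained sketch of the same check outside the Worker class (function name and structure are illustrative):

import torch

GiB_bytes = 1 << 30


def check_startup_memory(gpu_memory_utilization: float) -> None:
    # Mirrors the guard above: free memory at startup must cover the
    # fraction of total device memory that vLLM has been asked to use.
    free, total = torch.cuda.mem_get_info()
    requested = total * gpu_memory_utilization
    if free < requested:
        GiB = lambda b: round(b / GiB_bytes, 2)
        raise ValueError(
            f"Free memory on device ({GiB(free)}/{GiB(total)} GiB) on startup "
            f"is less than desired GPU memory utilization "
            f"({gpu_memory_utilization}, {GiB(requested)} GiB). Decrease GPU "
            f"memory utilization or reduce GPU memory used by other processes.")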
@@ -190,28 +203,47 @@ class Worker(WorkerBase):
         # GPU did not change their memory usage during the profiling.
         assert self.init_gpu_memory > free_gpu_memory, (
             "Error in memory profiling. "
-            f"Initial free memory {self.init_gpu_memory}, current free memory"
-            f" {free_gpu_memory}. This happens when the GPU memory was "
-            "not properly cleaned up before initializing the vLLM instance.")
+            f"Initial free memory {self.init_gpu_memory/GiB_bytes} GiB, "
+            f"current free memory {free_gpu_memory/GiB_bytes} GiB. "
+            f"This happens when the GPU memory was not properly cleaned up "
+            f"before initializing the vLLM instance.")
 
         # Get the peak memory allocation recorded by torch
-        peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"]
+        peak_torch_memory = torch.cuda.memory_stats(
+        )["allocated_bytes.all.peak"]
 
         # Check for any memory left around that may have been allocated on the
         # gpu outside of `torch`. NCCL operations, for example, can use a few
-        # GB during a forward pass
+        # GB during a forward pass.
         torch.cuda.empty_cache()
         torch_allocated_bytes = torch.cuda.memory_stats(
         )["allocated_bytes.all.current"]
-        total_allocated_bytes = torch.cuda.mem_get_info(
-        )[1] - torch.cuda.mem_get_info()[0]
-        non_torch_allocations = total_allocated_bytes - torch_allocated_bytes
-        if non_torch_allocations > 0:
-            peak_memory += non_torch_allocations
+
+        # Reset after emptying torch cache
+        free_gpu_memory = torch.cuda.mem_get_info()[0]
+
+        # Total forward allocation (current) is equal to the diff in free memory
+        fwd_alloc_bytes = self.init_gpu_memory - free_gpu_memory
+        # We assume current non-torch allocation is equal to peak
+        non_torch_alloc_bytes = max(0, fwd_alloc_bytes - torch_allocated_bytes)
+        # Total forward allocation (peak) is peak torch + non-torch
+        peak_memory = peak_torch_memory + non_torch_alloc_bytes
+
         available_kv_cache_memory = (
             total_gpu_memory * self.cache_config.gpu_memory_utilization -
             peak_memory)
 
+        GiB = lambda b: b / GiB_bytes
+        logger.debug(
+            "Initial free memory: %.2f GiB, free memory: %.2f GiB, "
+            "total GPU memory: %.2f GiB", GiB(self.init_gpu_memory),
+            GiB(free_gpu_memory), GiB(total_gpu_memory))
+        logger.debug(
+            "Peak torch memory: %.2f GiB, non-torch forward-pass memory: "
+            "%.2f GiB, available KVCache memory: %.2f GiB",
+            GiB(peak_torch_memory), GiB(non_torch_alloc_bytes),
+            GiB(available_kv_cache_memory))
+
         return int(available_kv_cache_memory)
 
     def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
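This hunk is the actual bug fix. The old formula derived non-torch usage from the device-wide gap between total and currently free memory, which also swept in memory that was already in use before profiling began (for example by other processes), inflating peak_memory and shrinking the KV cache budget. The new accounting takes torch's recorded peak, estimates non-torch usage as the drop in free memory since worker startup minus what torch still holds, and sums the two. A standalone sketch of that accounting (function and variable names are illustrative, not the worker's API):

import torch


def estimate_profiling_peak(init_free_bytes: int) -> int:
    # Peak bytes the torch caching allocator saw during the profiling run.
    peak_torch = torch.cuda.memory_stats()["allocated_bytes.all.peak"]

    # Release cached blocks so mem_get_info() reports only live allocations,
    # then re-read free memory (same ordering as the worker code above).
    torch.cuda.empty_cache()
    torch_current = torch.cuda.memory_stats()["allocated_bytes.all.current"]
    free_now, _ = torch.cuda.mem_get_info()

    # Memory this run is still holding on the device, torch-managed or not.
    fwd_alloc = init_free_bytes - free_now
    # Assume non-torch usage (e.g. NCCL buffers) is currently at its peak.
    non_torch = max(0, fwd_alloc - torch_current)
    return peak_torch + non_torch

The KV cache budget then follows directly as total_gpu_memory * gpu_memory_utilization - peak_memory, which is what the unchanged tail of the hunk computes.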