mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-12 19:15:35 +08:00
[TPU] [Perf] Improve Memory Usage Estimation (#15671)
Signed-off-by: Robert Shaw <robshaw@redhat.com> Co-authored-by: Robert Shaw <robshaw@redhat.com>
This commit is contained in:
parent
d03308be0c
commit
038bededba
@@ -161,7 +161,13 @@ class TPUWorker:
|
|||||||
# intermediate activations.
|
# intermediate activations.
|
||||||
m = xm.get_memory_info(self.device)
|
m = xm.get_memory_info(self.device)
|
||||||
total_memory_size = m["bytes_limit"]
|
total_memory_size = m["bytes_limit"]
|
||||||
profiled = m["peak_bytes_used"] # Weights + intermediate activations.
|
current_mem = m["bytes_used"]
|
||||||
|
# Ideally we would use profiled = m["peak_bytes_used"] to
|
||||||
|
# get weights + activations. But there is memory used during
|
||||||
|
# compilation / weight loading that impacts the peak and
|
||||||
|
# there is no way to reset peak memory in XLA, so we
|
||||||
|
# use a heuristic instead: current memory usage plus 2%.
|
||||||
|
profiled = current_mem * 1.02
|
||||||
|
|
||||||
# Calculate the TPU KV cache size based on profiling.
|
# Calculate the TPU KV cache size based on profiling.
|
||||||
usable_memory_size = int(total_memory_size *
|
usable_memory_size = int(total_memory_size *
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user