From d100d78eb31ee8db5d987fa0d2dc18bc96b52d3a Mon Sep 17 00:00:00 2001 From: "Grant Holmes (Ren)" <60802511+gholmes829@users.noreply.github.com> Date: Tue, 7 Oct 2025 04:20:30 -0500 Subject: [PATCH] Optimize KV cache distribution for asymmetric pipeline parallelism (#25164) Signed-off-by: gholmes829 --- tests/v1/core/test_kv_cache_utils.py | 10 ++-- vllm/config/cache.py | 2 +- vllm/entrypoints/llm.py | 2 +- vllm/v1/core/kv_cache_utils.py | 82 ++++++++++++++++++---------- vllm/v1/worker/gpu_worker.py | 6 +- 5 files changed, 64 insertions(+), 38 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index aed00a60aeb4..a31817ec72b6 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -681,10 +681,10 @@ def test_get_kv_cache_configs_multiple_workers(): num_blocks=10, kv_cache_tensors=[ KVCacheTensor( - size=ref_kv_cache_spec.page_size_bytes * 20, shared_by=["layer1"] + size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer1"] ), KVCacheTensor( - size=ref_kv_cache_spec.page_size_bytes * 20, shared_by=["layer2"] + size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer2"] ), ], kv_cache_groups=[ @@ -718,7 +718,7 @@ def test_get_kv_cache_configs_multiple_workers(): num_blocks=10, kv_cache_tensors=[ KVCacheTensor( - size=ref_kv_cache_spec.page_size_bytes * 20, shared_by=["layer1"] + size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer1"] ), ], kv_cache_groups=[ @@ -802,7 +802,7 @@ def test_get_kv_cache_configs_multiple_workers(): num_blocks=10, kv_cache_tensors=[ KVCacheTensor( - size=ref_kv_cache_spec.page_size_bytes * 20, shared_by=["layer3"] + size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer3"] ), ], kv_cache_groups=[ @@ -813,7 +813,7 @@ def test_get_kv_cache_configs_multiple_workers(): num_blocks=10, kv_cache_tensors=[ KVCacheTensor( - size=ref_kv_cache_spec.page_size_bytes * 20, shared_by=["layer3"] + size=ref_kv_cache_spec.page_size_bytes * 10, shared_by=["layer3"] ), ], kv_cache_groups=[ diff --git a/vllm/config/cache.py b/vllm/config/cache.py index 3d2496b7f21d..519e3d8b9bde 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -124,7 +124,7 @@ class CacheConfig: gpu_memory_utilization. However, users may want to manually specify the kv cache memory size. kv_cache_memory_bytes allows more fine-grain control of how much memory gets used when compared with using - gpu_memory_memory_utilization. Note that kv_cache_memory_bytes + gpu_memory_utilization. Note that kv_cache_memory_bytes (when not-None) ignores gpu_memory_utilization""" def compute_hash(self) -> str: diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index b1a68c216323..c797735f0c2d 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -143,7 +143,7 @@ class LLM: size based on gpu_memory_utilization. However, users may want to manually specify the kv cache memory size. kv_cache_memory_bytes allows more fine-grain control of how much memory gets used when - compared with using gpu_memory_memory_utilization. Note that + compared with using gpu_memory_utilization. Note that kv_cache_memory_bytes (when not-None) ignores gpu_memory_utilization swap_space: The size (GiB) of CPU memory per GPU to use as swap space. 
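The docstring corrections above describe two mutually exclusive ways to size the KV cache. A minimal usage sketch of those knobs, assuming `kv_cache_memory_bytes` and `gpu_memory_utilization` are accepted as `LLM` constructor arguments as the docstrings above indicate; the model name and byte value are illustrative only:

```python
from vllm import LLM

# Default path: vLLM profiles memory and sizes the KV cache from
# gpu_memory_utilization (fraction of GPU memory the engine may use).
llm_auto = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.9)

# Manual path: reserve an explicit KV cache budget. Per the docstring,
# a non-None kv_cache_memory_bytes ignores gpu_memory_utilization and
# skips memory profiling, so only use it for manual control.
llm_manual = LLM(
    model="facebook/opt-125m",
    kv_cache_memory_bytes=4 * 1024**3,  # 4 GiB, illustrative value
)
```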
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 4683ad62981f..bb0b7e259b41 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -1113,35 +1113,12 @@ def get_kv_cache_config_from_groups( KVCacheTensor(size=page_size * num_blocks, shared_by=shared_by) ) - kv_cache_config = KVCacheConfig( + return KVCacheConfig( num_blocks=num_blocks, kv_cache_tensors=kv_cache_tensors, kv_cache_groups=kv_cache_groups, ) - min_block_size = min([group.kv_cache_spec.block_size for group in kv_cache_groups]) - - # Print the KV cache size and maximum concurrency. - num_tokens = num_blocks // len(kv_cache_groups) * min_block_size - if vllm_config.parallel_config.decode_context_parallel_size > 1: - num_tokens *= vllm_config.parallel_config.decode_context_parallel_size - logger.info( - "Multiplying the GPU KV cache size by the dcp_world_size %d.", - vllm_config.parallel_config.decode_context_parallel_size, - ) - num_tokens_str = f"{num_tokens:,}" - logger.info("GPU KV cache size: %s tokens", num_tokens_str) - max_model_len_str = f"{vllm_config.model_config.max_model_len:,}" - max_concurrency = get_max_concurrency_for_kv_cache_config( - vllm_config, kv_cache_config - ) - logger.info( - "Maximum concurrency for %s tokens per request: %.2fx", - max_model_len_str, - max_concurrency, - ) - return kv_cache_config - def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): """ @@ -1265,6 +1242,45 @@ def generate_scheduler_kv_cache_config( return cfg +def _report_kv_cache_config( + vllm_config: VllmConfig, kv_cache_config: KVCacheConfig +) -> None: + """ + Log resolved KV cache configuration. + + Args: + vllm_config: The global VllmConfig + kv_cache_config: The resolved KV cache configuration + """ + min_block_size = min( + [group.kv_cache_spec.block_size for group in kv_cache_config.kv_cache_groups] + ) + + # Log the KV cache size and maximum concurrency. + num_tokens = ( + kv_cache_config.num_blocks + // len(kv_cache_config.kv_cache_groups) + * min_block_size + ) + if vllm_config.parallel_config.decode_context_parallel_size > 1: + num_tokens *= vllm_config.parallel_config.decode_context_parallel_size + logger.info( + "Multiplying the GPU KV cache size by the dcp_world_size %d.", + vllm_config.parallel_config.decode_context_parallel_size, + ) + num_tokens_str = f"{num_tokens:,}" + logger.info("GPU KV cache size: %s tokens", num_tokens_str) + max_model_len_str = f"{vllm_config.model_config.max_model_len:,}" + max_concurrency = get_max_concurrency_for_kv_cache_config( + vllm_config, kv_cache_config + ) + logger.info( + "Maximum concurrency for %s tokens per request: %.2fx", + max_model_len_str, + max_concurrency, + ) + + def get_kv_cache_configs( vllm_config: VllmConfig, kv_cache_specs: list[dict[str, KVCacheSpec]], @@ -1284,7 +1300,8 @@ def get_kv_cache_configs( 3. Generate the KV cache configs for each worker based on the KV cache grouping strategy. (This is reasonable because the layer ratio of different PP stages are similar.) - 4. Change the num_blocks of each worker to the smallest among all workers. + 4. Change the num_blocks of each worker to the smallest among all workers + and shrink tensor sizes proportionally to avoid allocating unused memory. Args: vllm_config: The global VllmConfig @@ -1345,13 +1362,22 @@ def get_kv_cache_configs( ) ) - # Change the num_blocks of each rank to the smallest among all ranks. We - # do not need to shrink the tensor size because it is valid to only use the - # first `num_blocks` blocks of the tensor. 
+ # Change the num_blocks of each rank to the smallest among all ranks. + # We also need to shrink the tensor size proportionally to avoid + # allocating unused memory. min_num_blocks = min( kv_cache_config.num_blocks for kv_cache_config in kv_cache_configs ) for kv_cache_config in kv_cache_configs: + num_blocks_old = kv_cache_config.num_blocks kv_cache_config.num_blocks = min_num_blocks + # Shrink tensor size proportionally + for tensor in kv_cache_config.kv_cache_tensors: + assert tensor.size % num_blocks_old == 0 + tensor.size = tensor.size // num_blocks_old * min_num_blocks + + if len(kv_cache_config.kv_cache_groups) > 0: + _report_kv_cache_config(vllm_config, kv_cache_config) + return kv_cache_configs diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 271aabb9e227..70f7c1d45b5f 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -253,10 +253,10 @@ class Worker(WorkerBase): self.model_runner.profile_run() msg = ( - f"Initial free memory {GiB(self.init_snapshot.free_memory)} " - f"GiB, reserved {GiB(kv_cache_memory_bytes):.2f}GiB memory for " + f"Initial free memory {GiB(self.init_snapshot.free_memory):.2f} " + f"GiB, reserved {GiB(kv_cache_memory_bytes):.2f} GiB memory for " "KV Cache as specified by kv_cache_memory_bytes config and " - "skipped memory profiling. This does does not respect the " + "skipped memory profiling. This does not respect the " "gpu_memory_utilization config. Only use kv_cache_memory_bytes " "config when you want manual control of KV cache memory " "size. If OOM'ed, check the difference of initial free "
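To make the new clamping step concrete, here is a self-contained sketch of the proportional shrink that `get_kv_cache_configs` now performs, using simplified stand-in classes rather than vLLM's real `KVCacheTensor`/`KVCacheConfig` types; the block counts and page size are illustrative, not values from the code above.

```python
from dataclasses import dataclass, field


@dataclass
class Tensor:
    """Simplified stand-in for KVCacheTensor: just a size in bytes."""
    size: int


@dataclass
class Config:
    """Simplified stand-in for KVCacheConfig: block count plus tensors."""
    num_blocks: int
    kv_cache_tensors: list[Tensor] = field(default_factory=list)


PAGE_SIZE = 4096  # bytes per block, illustrative

# Two PP ranks with different layer counts end up with different block
# counts after dividing the same memory budget between their layers
# (asymmetric pipeline parallelism).
configs = [
    Config(num_blocks=120, kv_cache_tensors=[Tensor(120 * PAGE_SIZE) for _ in range(2)]),
    Config(num_blocks=80, kv_cache_tensors=[Tensor(80 * PAGE_SIZE) for _ in range(3)]),
]

# Clamp every rank to the smallest block count, and shrink each tensor
# proportionally so no rank allocates memory it will never address.
min_num_blocks = min(cfg.num_blocks for cfg in configs)
for cfg in configs:
    num_blocks_old = cfg.num_blocks
    cfg.num_blocks = min_num_blocks
    for tensor in cfg.kv_cache_tensors:
        assert tensor.size % num_blocks_old == 0
        tensor.size = tensor.size // num_blocks_old * min_num_blocks

# Rank 0's tensors drop from 120 to 80 blocks' worth of bytes
# (491,520 -> 327,680), releasing the 40 blocks per tensor that the
# previous behavior allocated but never used.
assert all(t.size == 80 * PAGE_SIZE for cfg in configs for t in cfg.kv_cache_tensors)
```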