From 7a4a5de729466de4142e70b845e78a0171296468 Mon Sep 17 00:00:00 2001
From: Chauncey
Date: Fri, 18 Apr 2025 13:12:42 +0800
Subject: [PATCH] [Misc] Update outdated note: LMCache now supports chunked
 prefill (#16697)

Signed-off-by: chaunceyjiang
---
 examples/offline_inference/cpu_offload_lmcache.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/offline_inference/cpu_offload_lmcache.py b/examples/offline_inference/cpu_offload_lmcache.py
index 025444233d3b0..37aea281032fd 100644
--- a/examples/offline_inference/cpu_offload_lmcache.py
+++ b/examples/offline_inference/cpu_offload_lmcache.py
@@ -37,11 +37,11 @@ def build_llm_with_lmcache():
         '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
     # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
     # memory. Reduce the value if your GPU has less memory.
-    # Note that LMCache is not compatible with chunked prefill for now.
+    # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
     llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
               kv_transfer_config=ktc,
               max_model_len=8000,
-              enable_chunked_prefill=False,
+              enable_chunked_prefill=True,
               gpu_memory_utilization=0.8)

     try:
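For context, a minimal sketch of how the updated configuration might be used end to end. The `LLM(...)` keyword arguments and the connector JSON string are taken directly from the diff above; the `KVTransferConfig.from_cli` construction, the `SamplingParams`/`generate` usage, and the omission of LMCache-specific environment setup are assumptions based on the surrounding example file and may differ in your vLLM/LMCache versions.

```python
# Sketch only: mirrors the configuration shown in the patch above.
# Assumption: KVTransferConfig.from_cli accepts the JSON connector string;
# if your vLLM version differs, construct KVTransferConfig with keyword
# arguments instead. LMCache environment setup (CPU offload size, chunk
# size, etc.) is intentionally omitted here.
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

# KV transfer config pointing vLLM at the LMCache connector,
# acting as both KV producer and consumer.
ktc = KVTransferConfig.from_cli(
    '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')

# Chunked prefill can now stay enabled alongside LMCache
# (see vLLM#14505, LMCache#392), as the updated note states.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
          kv_transfer_config=ktc,
          max_model_len=8000,
          enable_chunked_prefill=True,
          gpu_memory_utilization=0.8)

# Minimal usage: a single prompt through the LMCache-backed engine.
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
for output in outputs:
    print(output.outputs[0].text)
```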