diff --git a/examples/lmcache/disagg_prefill_lmcache_v0.py b/examples/lmcache/disagg_prefill_lmcache_v0.py
index 7da6fb7aaa23..66cc94185230 100644
--- a/examples/lmcache/disagg_prefill_lmcache_v0.py
+++ b/examples/lmcache/disagg_prefill_lmcache_v0.py
@@ -49,9 +49,10 @@ def run_prefill(prefill_done, prompts):
 
     sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
 
-    ktc = KVTransferConfig.from_cli(
-        '{"kv_connector":"LMCacheConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
-    )
+    ktc = KVTransferConfig(kv_connector="LMCacheConnector",
+                           kv_role="kv_producer",
+                           kv_rank=0,
+                           kv_parallel_size=2)
     # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
     # memory. Reduce the value if your GPU has less memory.
     llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
@@ -78,9 +79,10 @@ def run_decode(prefill_done, prompts, timeout=1):
 
     sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
 
-    ktc = KVTransferConfig.from_cli(
-        '{"kv_connector":"LMCacheConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
-    )
+    ktc = KVTransferConfig(kv_connector="LMCacheConnector",
+                           kv_role="kv_consumer",
+                           kv_rank=1,
+                           kv_parallel_size=2)
     # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
     # of memory. Reduce the value if your GPU has less memory.
     llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
diff --git a/examples/lmcache/kv_cache_sharing_lmcache_v1.py b/examples/lmcache/kv_cache_sharing_lmcache_v1.py
index af1b4351dd54..7748f8ca6133 100644
--- a/examples/lmcache/kv_cache_sharing_lmcache_v1.py
+++ b/examples/lmcache/kv_cache_sharing_lmcache_v1.py
@@ -49,8 +49,8 @@ def run_store(store_done, prompts):
 
     sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
 
-    ktc = KVTransferConfig.from_cli(
-        '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
+    ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1",
+                           kv_role="kv_both")
     # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
     # memory. Reduce the value if your GPU has less memory.
     llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
@@ -76,8 +76,8 @@ def run_retrieve(store_done, prompts, timeout=1):
 
     sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
 
-    ktc = KVTransferConfig.from_cli(
-        '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
+    ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1",
+                           kv_role="kv_both")
     # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
     # of memory. Reduce the value if your GPU has less memory.
     llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
diff --git a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
index 66efbc0c9dee..11918f72feec 100644
--- a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
@@ -16,16 +16,17 @@ except FileNotFoundError:
 
 sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
 
-llm = LLM(
-    model="meta-llama/Llama-3.2-1B-Instruct",
-    enforce_eager=True,
-    gpu_memory_utilization=0.8,
-    max_num_batched_tokens=64,
-    max_num_seqs=16,
-    kv_transfer_config=KVTransferConfig.from_cli(
-        '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",'
-        '"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}'
-    )) #, max_model_len=2048, max_num_batched_tokens=2048)
+llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
+          enforce_eager=True,
+          gpu_memory_utilization=0.8,
+          max_num_batched_tokens=64,
+          max_num_seqs=16,
+          kv_transfer_config=KVTransferConfig(
+              kv_connector="SharedStorageConnector",
+              kv_role="kv_both",
+              kv_connector_extra_config={
+                  "shared_storage_path": "local_storage"
+              })) #, max_model_len=2048, max_num_batched_tokens=2048)
 
 # 1ST generation (prefill instance)
 outputs = llm.generate(prompts, sampling_params)
diff --git a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
index f7cbf6557d54..798128301e0f 100644
--- a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
@@ -17,11 +17,12 @@ sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
 llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
           enforce_eager=True,
           gpu_memory_utilization=0.8,
-          kv_transfer_config=KVTransferConfig.from_cli(
-              '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", '
-              '"kv_connector_extra_config": '
-              '{"shared_storage_path": "local_storage"}}')
-          ) #, max_model_len=2048, max_num_batched_tokens=2048)
+          kv_transfer_config=KVTransferConfig(
+              kv_connector="SharedStorageConnector",
+              kv_role="kv_both",
+              kv_connector_extra_config={
+                  "shared_storage_path": "local_storage"
+              })) #, max_model_len=2048, max_num_batched_tokens=2048)
 
 # 1ST generation (prefill instance)
 outputs = llm.generate(
diff --git a/examples/offline_inference/disaggregated_prefill.py b/examples/offline_inference/disaggregated_prefill.py
index d60985146c5c..bb6fdd48f79e 100644
--- a/examples/offline_inference/disaggregated_prefill.py
+++ b/examples/offline_inference/disaggregated_prefill.py
@@ -32,9 +32,10 @@ def run_prefill(prefill_done):
     # This instance is the prefill node (kv_producer, rank 0).
     # The number of parallel instances for KV cache transfer is set to 2,
     # as required for PyNcclConnector.
-    ktc = KVTransferConfig.from_cli(
-        '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
-    )
+    ktc = KVTransferConfig(kv_connector="PyNcclConnector",
+                           kv_role="kv_producer",
+                           kv_rank=0,
+                           kv_parallel_size=2)
 
     # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
     # memory. You may need to adjust the value to fit your GPU.
@@ -71,9 +72,10 @@ def run_decode(prefill_done):
     # This instance is the decode node (kv_consumer, rank 1).
     # The number of parallel instances for KV cache transfer is set to 2,
     # as required for PyNcclConnector.
-    ktc = KVTransferConfig.from_cli(
-        '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
-    )
+    ktc = KVTransferConfig(kv_connector="PyNcclConnector",
+                           kv_role="kv_consumer",
+                           kv_rank=1,
+                           kv_parallel_size=2)
 
     # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
     # memory. You may need to adjust the value to fit your GPU.
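
For reference, a minimal sketch of the constructor-based style that every hunk above converges on, assuming the import paths already used in these example scripts; the prompt text, connector choice, and memory settings are illustrative values copied from the examples, not new requirements:

    from vllm import LLM, SamplingParams
    from vllm.config import KVTransferConfig

    # Keyword arguments replace the JSON string previously passed to
    # KVTransferConfig.from_cli(...).
    ktc = KVTransferConfig(kv_connector="SharedStorageConnector",
                           kv_role="kv_both",
                           kv_connector_extra_config={
                               "shared_storage_path": "local_storage"
                           })

    # The resulting config object is handed to the LLM constructor as before.
    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
              enforce_eager=True,
              gpu_memory_utilization=0.8,
              kv_transfer_config=ktc)

    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
    outputs = llm.generate(["Hello, my name is"], sampling_params)

The two forms are intended to be equivalent; the keyword form simply avoids embedding a JSON document inside a Python string literal.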