mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-15 06:45:01 +08:00
[KV Connector] Update lmcache connector with latest compatibility (#27681)
Signed-off-by: Samuel Shen <slshen@uchicago.edu> Co-authored-by: Samuel Shen <slshen@uchicago.edu>
This commit is contained in:
parent
0d8161b075
commit
a4a4f0f617
@ -44,8 +44,8 @@ from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_integration.utils impo
|
|||||||
)
|
)
|
||||||
from vllm.distributed.parallel_state import get_tensor_model_parallel_rank, get_tp_group
|
from vllm.distributed.parallel_state import get_tensor_model_parallel_rank, get_tp_group
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
from vllm.utils import get_kv_cache_torch_dtype
|
|
||||||
from vllm.utils.math_utils import cdiv
|
from vllm.utils.math_utils import cdiv
|
||||||
|
from vllm.utils.torch_utils import get_kv_cache_torch_dtype
|
||||||
from vllm.v1.core.sched.output import SchedulerOutput
|
from vllm.v1.core.sched.output import SchedulerOutput
|
||||||
from vllm.version import __version__ as VLLM_VERSION
|
from vllm.version import __version__ as VLLM_VERSION
|
||||||
|
|
||||||
@ -389,7 +389,7 @@ class ReqMeta:
|
|||||||
|
|
||||||
|
|
||||||
def need_gpu_interm_buffer(lmcache_config: LMCacheEngineConfig):
|
def need_gpu_interm_buffer(lmcache_config: LMCacheEngineConfig):
|
||||||
return lmcache_config.enable_pd
|
return not lmcache_config.enable_pd
|
||||||
|
|
||||||
|
|
||||||
def _calculate_mtp_layers(vllm_config, model_config):
|
def _calculate_mtp_layers(vllm_config, model_config):
|
||||||
@ -403,6 +403,20 @@ def _calculate_mtp_layers(vllm_config, model_config):
|
|||||||
num_mtp_layers = getattr(
|
num_mtp_layers = getattr(
|
||||||
model_config.hf_config, "num_nextn_predict_layers", 0
|
model_config.hf_config, "num_nextn_predict_layers", 0
|
||||||
)
|
)
|
||||||
|
|
||||||
|
elif vllm_config.speculative_config.use_eagle():
|
||||||
|
try:
|
||||||
|
draft_model_config = vllm_config.speculative_config.draft_model_config
|
||||||
|
num_mtp_layers = draft_model_config.get_num_layers(
|
||||||
|
vllm_config.parallel_config
|
||||||
|
)
|
||||||
|
logger.info("EAGLE detected %d extra layer(s)", num_mtp_layers)
|
||||||
|
except Exception:
|
||||||
|
logger.info(
|
||||||
|
"EAGLE detected, but failed to get the number of extra layers"
|
||||||
|
"falling back to 1"
|
||||||
|
)
|
||||||
|
num_mtp_layers = 1
|
||||||
return num_mtp_layers
|
return num_mtp_layers
|
||||||
|
|
||||||
|
|
||||||
@ -1208,6 +1222,10 @@ class LMCacheConnectorV1Impl:
|
|||||||
if the CacheManager this allocated blocks for us.
|
if the CacheManager this allocated blocks for us.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Clear local status in lookup client when a new request is
|
||||||
|
# successfully scheduled.
|
||||||
|
self.lookup_client.clear_lookup_status(request.request_id)
|
||||||
|
|
||||||
kv_transfer_params = (
|
kv_transfer_params = (
|
||||||
request.kv_transfer_params
|
request.kv_transfer_params
|
||||||
if hasattr(request, "kv_transfer_params")
|
if hasattr(request, "kv_transfer_params")
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user