mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-15 11:15:01 +08:00)
[NIXL] fix cpu PD after physical <> logical block_size PR (#28904)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
This commit is contained in:
parent e4bb2684bc
commit c3e2978620
@@ -55,7 +55,7 @@ DECODE_BLOCK_SIZE=${DECODE_BLOCK_SIZE:-128}
 # Find the git repository root directory
 GIT_ROOT=$(git rev-parse --show-toplevel)
 
-SMI_BIN=$(which nvidia-smi || which rocm-smi)
+SMI_BIN=$(which nvidia-smi || which rocm-smi || echo "")
 
 # Trap the SIGINT signal (triggered by Ctrl+C)
 trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT
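The `|| echo ""` fallback keeps the assignment from failing when neither binary is installed: SMI_BIN simply becomes an empty string, and the dispatch in get_num_gpus below can fall through to a default. A minimal sketch of the same fallback in Python (the helper name is hypothetical; the script itself is bash):

import shutil

def find_smi_binary() -> str:
    # Prefer nvidia-smi, then rocm-smi; fall back to "" rather than
    # failing, mirroring `which nvidia-smi || which rocm-smi || echo ""`.
    return shutil.which("nvidia-smi") or shutil.which("rocm-smi") or ""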
@@ -91,8 +91,13 @@ get_model_args() {
 get_num_gpus() {
     if [[ "$SMI_BIN" == *"nvidia"* ]]; then
         echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)"
-    else
+    elif [[ "$SMI_BIN" == *"rocm"* ]]; then
         echo "$($SMI_BIN -l | grep GPU | wc -l)"
+    else
+        # works for non-CUDA platforms,
+        # assuming at least 1 device and
+        # letting the system decide which card to use
+        echo "1"
     fi
 }
 
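Sketched in Python under the same assumption (at least one device on platforms with no SMI tool; helper and argument names are hypothetical), the new dispatch reads:

import subprocess

def get_num_gpus(smi_bin: str) -> int:
    if "nvidia" in smi_bin:
        out = subprocess.run(
            [smi_bin, "--query-gpu=name", "--format=csv,noheader"],
            capture_output=True, text=True, check=True,
        ).stdout
        return len(out.splitlines())
    if "rocm" in smi_bin:
        out = subprocess.run(
            [smi_bin, "-l"], capture_output=True, text=True, check=True
        ).stdout
        return sum(1 for line in out.splitlines() if "GPU" in line)
    # Non-CUDA platform (smi_bin is ""): assume at least one device
    # and let the system decide which card to use.
    return 1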
@@ -95,6 +95,7 @@ def install_system_dependencies():
         "meson",
         "libtool",
         "libtool-bin",
+        "pkg-config",
     ]
     run_command(["apt-get", "update"])
     run_command(["apt-get", "install", "-y"] + apt_packages)
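run_command here is presumably a thin wrapper over subprocess; a minimal sketch under that assumption (the real helper may add logging or error formatting):

import subprocess

def run_command(cmd: list[str]) -> None:
    # Raise CalledProcessError on a non-zero exit, so a failed
    # apt-get stops the installation early.
    subprocess.run(cmd, check=True)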
@@ -1161,6 +1161,14 @@ class NixlConnectorWorker:
         # to better exploit the memory layout (ie num_blocks is the first dim).
         split_k_and_v = self.kv_topo.split_k_and_v
         tensor_size_bytes = None
+
+        # TODO (NickLucche): Get kernel_block_size in a cleaner way
+        # NHD default "view" for non-MLA cache
+        if self.device_type == "cpu":
+            block_size_position = -2
+        else:
+            block_size_position = -2 if self.use_mla else -3
+
         # Enable different block lengths for different layers when MLA is used.
         self.block_len_per_layer = list[int]()
         self.slot_size_per_layer = list[int]()  # HD bytes in kv terms
@@ -1175,9 +1183,7 @@ class NixlConnectorWorker:
             if base_addr in seen_base_addresses:
                 continue
 
-            # TODO (NickLucche): Get kernel_block_size in a cleaner way
-            # NHD default "view" for non-MLA cache
-            kernel_block_size = cache.shape[-2] if self.use_mla else cache.shape[-3]
+            kernel_block_size = cache.shape[block_size_position]
 
             if self.block_size != kernel_block_size:
                 logger.info_once(
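Taken together, the two hunks replace the inline MLA/non-MLA conditional with a precomputed block_size_position, adding a CPU case where block_size also sits second-to-last. A standalone sketch of the indexing convention (the concrete layouts are illustrative assumptions; the real shapes come from the attention backend):

def kernel_block_size_of(
    shape: tuple[int, ...], *, use_mla: bool, device_type: str
) -> int:
    if device_type == "cpu":
        # Assumed CPU cache layout, e.g. (num_blocks, block_size, hidden).
        block_size_position = -2
    else:
        # MLA cache:         (num_blocks, block_size, head_dim)               -> -2
        # non-MLA NHD cache: (2, num_blocks, block_size, num_heads, head_dim) -> -3
        block_size_position = -2 if use_mla else -3
    return shape[block_size_position]

# Illustrative shapes only:
assert kernel_block_size_of((2, 1024, 16, 8, 64), use_mla=False, device_type="cuda") == 16
assert kernel_block_size_of((1024, 16, 576), use_mla=True, device_type="cuda") == 16
assert kernel_block_size_of((1024, 16, 512), use_mla=False, device_type="cpu") == 16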