diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
index 87c9a105e9363..453ccc81eb14a 100755
--- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
@@ -55,7 +55,7 @@ DECODE_BLOCK_SIZE=${DECODE_BLOCK_SIZE:-128}
 
 # Find the git repository root directory
 GIT_ROOT=$(git rev-parse --show-toplevel)
-SMI_BIN=$(which nvidia-smi || which rocm-smi)
+SMI_BIN=$(which nvidia-smi || which rocm-smi || echo "")
 
 # Trap the SIGINT signal (triggered by Ctrl+C)
 trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT
@@ -91,8 +91,13 @@ get_model_args() {
 
 get_num_gpus() {
     if [[ "$SMI_BIN" == *"nvidia"* ]]; then
         echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)"
-    else
+    elif [[ "$SMI_BIN" == *"rocm"* ]]; then
         echo "$($SMI_BIN -l | grep GPU | wc -l)"
+    else
+        # Works for non-CUDA platforms: assume
+        # at least 1 device is present and let
+        # the system decide which card to use.
+        echo "1"
     fi
 }
diff --git a/tools/install_nixl_from_source_ubuntu.py b/tools/install_nixl_from_source_ubuntu.py
index a786abba95ad9..b8a55c615426e 100644
--- a/tools/install_nixl_from_source_ubuntu.py
+++ b/tools/install_nixl_from_source_ubuntu.py
@@ -95,6 +95,7 @@ def install_system_dependencies():
         "meson",
         "libtool",
         "libtool-bin",
+        "pkg-config",
     ]
     run_command(["apt-get", "update"])
     run_command(["apt-get", "install", "-y"] + apt_packages)
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 5ff95876ef34d..1626f819af8b5 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -1161,6 +1161,14 @@ class NixlConnectorWorker:
         # to better exploit the memory layout (ie num_blocks is the first dim).
         split_k_and_v = self.kv_topo.split_k_and_v
         tensor_size_bytes = None
+
+        # TODO (NickLucche): Get kernel_block_size in a cleaner way
+        # NHD default "view" for non-MLA cache
+        if self.device_type == "cpu":
+            block_size_position = -2
+        else:
+            block_size_position = -2 if self.use_mla else -3
+
         # Enable different block lengths for different layers when MLA is used.
         self.block_len_per_layer = list[int]()
         self.slot_size_per_layer = list[int]()  # HD bytes in kv terms
@@ -1175,9 +1183,7 @@ class NixlConnectorWorker:
                 if base_addr in seen_base_addresses:
                     continue
 
-                # TODO (NickLucche): Get kernel_block_size in a cleaner way
-                # NHD default "view" for non-MLA cache
-                kernel_block_size = cache.shape[-2] if self.use_mla else cache.shape[-3]
+                kernel_block_size = cache.shape[block_size_position]
 
                 if self.block_size != kernel_block_size:
                     logger.info_once(
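
For reviewers unfamiliar with the cache layouts involved, here is a minimal standalone sketch of the `block_size_position` logic introduced above. The helper function and the example shapes are illustrative assumptions (an NHD-style non-MLA GPU cache, an MLA cache, and `block_size = 16`), not code or layouts copied from vLLM:

```python
# Hypothetical standalone mirror of the diff's block_size_position logic;
# the shapes below are assumptions for illustration, not vLLM's actual layouts.
import torch


def block_size_position(device_type: str, use_mla: bool) -> int:
    # Per the diff, CPU caches keep block_size second-to-last regardless of MLA.
    if device_type == "cpu":
        return -2
    # On GPU, an MLA cache is (num_blocks, block_size, head_dim), while the
    # default NHD view is (num_blocks, block_size, num_heads, head_dim), so
    # block_size sits one dimension further from the end.
    return -2 if use_mla else -3


# Assumed example shapes, each with block_size = 16:
gpu_nhd = torch.empty(10, 16, 8, 128)  # (num_blocks, block_size, num_heads, head_dim)
gpu_mla = torch.empty(10, 16, 576)     # (num_blocks, block_size, head_dim)

assert gpu_nhd.shape[block_size_position("cuda", use_mla=False)] == 16
assert gpu_mla.shape[block_size_position("cuda", use_mla=True)] == 16
```

A side benefit of the refactor itself: hoisting the index computation out of the per-layer loop means every layer reuses the same `block_size_position` instead of re-deriving it from `self.use_mla` on each iteration.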