[Nixl][P/D] Add cuda2cpu support (HD->DH transfer) (#24690)
Signed-off-by: Chenxi Yang <cxyang@fb.com>
Co-authored-by: Chenxi Yang <cxyang@fb.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>

parent 9f78b9ca84
commit f84b2a0dd0
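
In short, this change lets the NixlConnector stage KV-cache blocks through a CPU (host) buffer in addition to the default CUDA buffer (the HD->DH transfer in the title). It threads a --kv_buffer_device option through the integration test scripts, documents 'cpu' as a valid KVTransferConfig.kv_buffer_device choice, whitelists cpu as a buffer type for the cuda platform in the connector's supported-device table, adds block-copy helpers to the CUDA platform, and has the GPU model runner register host-buffer copy ops unconditionally.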
@@ -1,6 +1,31 @@
 #!/bin/bash
 set -xe
 
+# Parse command line arguments
+KV_BUFFER_DEVICE="cuda"  # Default to cuda
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --kv_buffer_device)
+            KV_BUFFER_DEVICE="$2"
+            shift 2
+            ;;
+        *)
+            echo "Unknown option $1"
+            echo "Usage: $0 [--kv_buffer_device <cuda|cpu>]"
+            exit 1
+            ;;
+    esac
+done
+
+echo "Running accuracy tests with kv_buffer_device=$KV_BUFFER_DEVICE"
+
+# Build the kv-transfer-config once
+if [[ "$KV_BUFFER_DEVICE" == "cuda" ]]; then
+    KV_CONFIG='{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
+else
+    KV_CONFIG="{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"$KV_BUFFER_DEVICE\"}"
+fi
+
 # Models to run
 MODELS=(
     "Qwen/Qwen3-0.6B"
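
The two KV_CONFIG strings above differ only in whether kv_buffer_device is present. A minimal Python sketch (illustrative, not part of the patch) that produces the same JSON:

    import json

    def build_kv_config(kv_buffer_device: str = "cuda") -> str:
        # Mirror the script's branching: omit kv_buffer_device for the
        # "cuda" default, include it explicitly otherwise (e.g. "cpu").
        config = {"kv_connector": "NixlConnector", "kv_role": "kv_both"}
        if kv_buffer_device != "cuda":
            config["kv_buffer_device"] = kv_buffer_device
        return json.dumps(config, separators=(",", ":"))

    print(build_kv_config())       # {"kv_connector":"NixlConnector","kv_role":"kv_both"}
    print(build_kv_config("cpu"))  # ...,"kv_buffer_device":"cpu"}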
@@ -93,7 +118,7 @@ run_tests_for_model() {
         --enforce-eager \
         --gpu-memory-utilization 0.2 \
         --tensor-parallel-size $PREFILLER_TP_SIZE \
-        --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
+        --kv-transfer-config '$KV_CONFIG'"
 
     if [ -n "$model_args" ]; then
         FULL_CMD="$BASE_CMD $model_args"
@@ -128,7 +153,7 @@ run_tests_for_model() {
         --enforce-eager \
         --gpu-memory-utilization 0.2 \
         --tensor-parallel-size $DECODER_TP_SIZE \
-        --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
+        --kv-transfer-config '$KV_CONFIG'"
 
     if [ -n "$model_args" ]; then
         FULL_CMD="$BASE_CMD $model_args"
tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh (35 lines changed; Normal file → Executable file)
@@ -1,6 +1,33 @@
 #!/bin/bash
 set -xe
 
+# Parse command line arguments
+KV_BUFFER_DEVICE="cuda"  # Default to cuda
+PREFILL_GPU_ID=4  # Default GPU IDs
+DECODE_GPU_ID=5
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --kv_buffer_device)
+            KV_BUFFER_DEVICE="$2"
+            shift 2
+            ;;
+        *)
+            echo "Unknown option $1"
+            echo "Usage: $0 [--kv_buffer_device <cuda|cpu>]"
+            exit 1
+            ;;
+    esac
+done
+
+echo "Running edge case tests with kv_buffer_device=$KV_BUFFER_DEVICE (GPUs: $PREFILL_GPU_ID, $DECODE_GPU_ID)"
+
+# Build the kv-transfer-config once
+if [[ "$KV_BUFFER_DEVICE" == "cuda" ]]; then
+    KV_CONFIG='{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
+else
+    KV_CONFIG="{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"$KV_BUFFER_DEVICE\"}"
+fi
+
 # Models to run
 MODELS=(
     "Qwen/Qwen3-0.6B"
@@ -54,11 +81,11 @@ run_tests_for_model() {
     # Start prefill instance
     PREFILL_PORT=8001
 
-    BASE_CMD="CUDA_VISIBLE_DEVICES=0 VLLM_NIXL_SIDE_CHANNEL_PORT=5559 vllm serve $model_name \
+    BASE_CMD="CUDA_VISIBLE_DEVICES=$PREFILL_GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=5559 vllm serve $model_name \
         --port $PREFILL_PORT \
         --enforce-eager \
         --gpu-memory-utilization 0.2 \
-        --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
+        --kv-transfer-config '$KV_CONFIG'"
 
     if [ -n "$model_args" ]; then
         FULL_CMD="$BASE_CMD $model_args"
@@ -72,11 +99,11 @@ run_tests_for_model() {
     DECODE_PORT=8002
 
     # Build the command with or without model-specific args
-    BASE_CMD="CUDA_VISIBLE_DEVICES=1 VLLM_NIXL_SIDE_CHANNEL_PORT=6000 vllm serve $model_name \
+    BASE_CMD="CUDA_VISIBLE_DEVICES=$DECODE_GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=6000 vllm serve $model_name \
        --port $DECODE_PORT \
        --enforce-eager \
        --gpu-memory-utilization 0.2 \
-       --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'"
+       --kv-transfer-config '$KV_CONFIG'"
 
     if [ -n "$model_args" ]; then
         FULL_CMD="$BASE_CMD $model_args"
@@ -28,8 +28,8 @@ class KVTransferConfig:
     """The engine id for KV transfers."""
 
     kv_buffer_device: Optional[str] = "cuda"
-    """The device used by kv connector to buffer the KV cache.
-    Currently only support 'cuda'."""
+    """The device used by kv connector to buffer the KV cache. Choices are
+    'cuda' and 'cpu'."""
 
     kv_buffer_size: float = 1e9
     """The buffer size for TorchDistributedConnector. Measured in number of
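
The same setting can be passed programmatically; a hedged sketch, assuming KVTransferConfig is importable from vllm.config (the exact import path may vary across vLLM versions):

    from vllm.config import KVTransferConfig

    # Stage KV blocks through host memory; "cuda" remains the default.
    kv_cfg = KVTransferConfig(
        kv_connector="NixlConnector",
        kv_role="kv_both",
        kv_buffer_device="cpu",
    )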
@@ -67,7 +67,10 @@ except ImportError:
 # Supported platforms and types of kv transfer buffer.
 # {device: tuple of supported kv buffer types}
 _NIXL_SUPPORTED_DEVICE = {
-    "cuda": ("cuda", ),
+    "cuda": (
+        "cuda",
+        "cpu",
+    ),
     "tpu": ("cpu", ),
     "xpu": ("cpu", ),
 }
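
To make the table's role concrete, here is a small sketch of the kind of lookup the connector can now satisfy for a cpu buffer on cuda; check_buffer_device is a hypothetical helper, not code from this patch:

    _NIXL_SUPPORTED_DEVICE = {
        "cuda": ("cuda", "cpu"),  # "cpu" newly allowed by this change
        "tpu": ("cpu",),
        "xpu": ("cpu",),
    }

    def check_buffer_device(platform: str, kv_buffer_device: str) -> None:
        # Hypothetical validation mirroring the connector's whitelist.
        supported = _NIXL_SUPPORTED_DEVICE.get(platform, ())
        if kv_buffer_device not in supported:
            raise ValueError(
                f"kv_buffer_device={kv_buffer_device!r} unsupported on "
                f"{platform!r}; supported: {supported}")

    check_buffer_device("cuda", "cpu")  # passes after this change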
@@ -701,6 +704,9 @@ class NixlConnectorWorker:
 
     def set_host_xfer_buffer_ops(self, copy_operation: CopyBlocksOp):
         """Assign copy (d2h, h2d) operations when host buffer is used."""
+        # Set a no-op if the host buffer is not cpu.
+        if self.kv_buffer_device != "cpu":
+            return
         assert self.use_host_buffer
         self.copy_blocks = copy_operation
 
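
For orientation, CopyBlocksOp is the callable the model runner hands in (copy_kv_blocks in the runner hunk below). A sketch of the shape such an operation could take; the parameter names and alias definition here are assumptions, not taken from this diff:

    from typing import Literal
    import torch

    def copy_blocks_sketch(
        src_kv_caches: dict[str, torch.Tensor],
        dst_kv_caches: dict[str, torch.Tensor],
        src_block_ids: list[int],
        dst_block_ids: list[int],
        direction: Literal["h2d", "d2h"],
    ) -> None:
        # Assumed signature: copy the selected blocks of every layer's cache
        # between host and device, the d2h/h2d duties named in the docstring.
        # A real implementation would pick streams/non_blocking per direction.
        for layer_name, src in src_kv_caches.items():
            dst = dst_kv_caches[layer_name]
            dst[:, dst_block_ids] = src[:, src_block_ids].to(dst.device)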
@@ -500,6 +500,30 @@ class CudaPlatformBase(Platform):
             "You can use float16 instead by explicitly setting the "
             "`dtype` flag in CLI, for example: --dtype=half.")
 
+    @classmethod
+    def insert_blocks_to_device(
+        cls,
+        src_cache: torch.Tensor,
+        dst_cache: torch.Tensor,
+        src_block_indices: torch.Tensor,
+        dst_block_indices: torch.Tensor,
+    ) -> None:
+        """Copy blocks from src_cache to dst_cache on GPU."""
+        _src_cache = src_cache[:, src_block_indices]
+        dst_cache[:, dst_block_indices] = _src_cache.to(dst_cache.device)
+
+    @classmethod
+    def swap_out_blocks_to_host(
+        cls,
+        src_cache: torch.Tensor,
+        dst_cache: torch.Tensor,
+        src_block_indices: torch.Tensor,
+        dst_block_indices: torch.Tensor,
+    ) -> None:
+        """Copy blocks from GPU to host (CPU)."""
+        _src_cache = src_cache[:, src_block_indices]
+        dst_cache[:, dst_block_indices] = _src_cache.cpu()
+
     @classmethod
     def support_hybrid_kv_cache(cls) -> bool:
         return True
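
Both helpers rely on the same advanced-indexing pattern: select whole blocks along dim 1, then assign across devices. A quick self-contained check of that semantics on CPU tensors (illustrative only; shapes are made up):

    import torch

    # Fake cache layout: [2 (K/V), num_blocks, block_size].
    src_cache = torch.arange(24, dtype=torch.float32).reshape(2, 4, 3)
    dst_cache = torch.zeros(2, 4, 3)

    src_idx = torch.tensor([0, 2])
    dst_idx = torch.tensor([1, 3])

    # Same indexing as insert_blocks_to_device / swap_out_blocks_to_host.
    dst_cache[:, dst_idx] = src_cache[:, src_idx].to(dst_cache.device)

    assert torch.equal(dst_cache[:, 1], src_cache[:, 0])
    assert torch.equal(dst_cache[:, 3], src_cache[:, 2])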
@@ -4059,10 +4059,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         self.drafter.validate_same_kv_cache_group(kv_cache_config)
 
         if has_kv_transfer_group():
-            get_kv_transfer_group().register_kv_caches(kv_caches)
-            if self.device.type == 'xpu':
-                get_kv_transfer_group().set_host_xfer_buffer_ops(
-                    copy_kv_blocks)
+            kv_transfer_group = get_kv_transfer_group()
+            kv_transfer_group.register_kv_caches(kv_caches)
+            kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks)
 
         if self.dcp_world_size > 1:
             layer_names = self.attn_groups[0][0].layer_names