From 2ba60ec7febaa439dd40040018116f55834b4170 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= <nlucches@redhat.com>
Date: Fri, 17 Oct 2025 16:13:31 +0200
Subject: [PATCH] [CI] Nixl integration tests (#27010)

Signed-off-by: NickLucche <nlucches@redhat.com>
---
 .buildkite/test-pipeline.yaml                 | 11 +++++
 .../nixl_integration/run_accuracy_test.sh     | 20 ++++++----
 .../nixl_integration/test_accuracy.py         |  7 +++-
 .../nixl_integration/toy_proxy_server.py      |  3 +-
 .../tp_config_sweep_accuracy_test.sh          | 40 +++++++++++++++++++
 5 files changed, 72 insertions(+), 9 deletions(-)
 create mode 100755 tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 238b6ef98bf23..69e36f2804c4d 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -1084,6 +1084,17 @@ steps:
   - tests/weight_loading
   commands:
     - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+
+- label: NixlConnector P/D accuracy tests (Distributed) # 30min
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+    - tests/v1/kv_connector/nixl_integration/
+  commands:
+    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
+    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
 
 
 ##### multi gpus test #####
diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
index ed6154462bb2b..31d437837dacb 100755
--- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
@@ -34,15 +34,21 @@ else
 fi
 
 # Models to run
-MODELS=(
-    "Qwen/Qwen3-0.6B"
-)
+MODEL_NAMES=${MODEL_NAMES:-}
+if [[ -n "$MODEL_NAMES" ]]; then
+  MODELS=("$MODEL_NAMES")
+else
+  MODELS=(
+      "Qwen/Qwen3-0.6B"
+  )
+fi
 
 # Number of prefill and decode instances to create
 NUM_PREFILL_INSTANCES=${NUM_PREFILL_INSTANCES:-1} # Default to 1
 NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1}   # Default to 1
 PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1}
 DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
+GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.2}
 
 # Find the git repository root directory
 GIT_ROOT=$(git rev-parse --show-toplevel)
@@ -130,7 +136,7 @@ run_tests_for_model() {
     vllm serve $model_name \
     --port $PORT \
     --enforce-eager \
-    --gpu-memory-utilization 0.2 \
+    --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
     --tensor-parallel-size $PREFILLER_TP_SIZE \
     --kv-transfer-config '$KV_CONFIG'"
 
@@ -171,7 +177,7 @@ run_tests_for_model() {
     vllm serve $model_name \
     --port $PORT \
     --enforce-eager \
-    --gpu-memory-utilization 0.2 \
+    --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
     --tensor-parallel-size $DECODER_TP_SIZE \
     --kv-transfer-config '$KV_CONFIG'"
 
@@ -200,7 +206,7 @@ run_tests_for_model() {
   done
 
   # Build the command for the proxy server with all the hosts and ports
-  PROXY_CMD="python ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port 8192"
+  PROXY_CMD="python3 ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port 8192"
 
   # Add all prefill hosts and ports
   PROXY_CMD+=" --prefiller-hosts ${PREFILL_HOSTS[@]}"
@@ -219,7 +225,7 @@ run_tests_for_model() {
 
   # Run lm eval for this model
   echo "Running tests for $model_name"
-  TEST_MODEL=$model_name python -m pytest -s -x ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/test_accuracy.py
+  TEST_MODEL=$model_name python3 -m pytest -s -x ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/test_accuracy.py
 
   # Clean up before running next model
   cleanup_instances
diff --git a/tests/v1/kv_connector/nixl_integration/test_accuracy.py b/tests/v1/kv_connector/nixl_integration/test_accuracy.py
index b301968e5bf84..a70f4caeb9370 100644
--- a/tests/v1/kv_connector/nixl_integration/test_accuracy.py
+++ b/tests/v1/kv_connector/nixl_integration/test_accuracy.py
@@ -12,7 +12,12 @@ FILTER = "exact_match,strict-match"
 RTOL = 0.03
 
 # Model-specific expected values
-EXPECTED_VALUES = {"Qwen/Qwen3-0.6B": 0.41, "deepseek-ai/deepseek-vl2-small": 0.59}
+EXPECTED_VALUES = {
+    "Qwen/Qwen3-0.6B": 0.41,
+    "deepseek-ai/deepseek-vl2-small": 0.59,
+    "deepseek-ai/deepseek-vl2-tiny": 0.19,
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": 0.65,
+}
 
 SIMPLE_PROMPT = (
     "The best part about working on vLLM is that I got to meet so many people across "
diff --git a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py
index 37d70510fe256..5768fcdb57ceb 100644
--- a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py
+++ b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py
@@ -76,7 +76,8 @@ def parse_args():
     parser = argparse.ArgumentParser()
 
     parser.add_argument("--port", type=int, default=8000)
-    parser.add_argument("--host", type=str, default="localhost")
+    # Default to 127.0.0.1: "localhost" can resolve to IPv6, which is blocked on CI
+    parser.add_argument("--host", type=str, default="127.0.0.1")
 
     # For prefiller instances
     parser.add_argument(
diff --git a/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
new file mode 100755
index 0000000000000..537764aafc13f
--- /dev/null
+++ b/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Run the sibling run_accuracy_test.sh sequentially with varying TP configurations.
+SCRIPT="$(dirname -- "${BASH_SOURCE[0]}")/run_accuracy_test.sh"  # cwd-independent
+
+# Each entry is a space-separated list of KEY=VALUE env overrides for one run
+configs=(
+  "GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=2"
+  "GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2"
+  "GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA case
+  "GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
+)
+
+run_tests() {
+  # Run ${SCRIPT} per 'configs' entry; exit 1 on first failure. $1: label, $2: extra env.
+  local label=$1 extra_env=$2 cfg env_args
+  echo "=== Running tests (${label}) ==="
+  for cfg in "${configs[@]}"; do
+    echo "-> Running with ${cfg} ${extra_env:+and ${extra_env}}"
+    # Split into KEY=VALUE words (no glob expansion); 'env' sets them without eval
+    read -r -a env_args <<< "${extra_env} ${cfg}"
+    if ! env "${env_args[@]}" bash "${SCRIPT}"; then
+      echo "❌ Test failed for config: ${cfg} ${extra_env:+(${extra_env})}"
+      exit 1
+    fi
+  done
+  echo "✅ All ${label} tests passed!"
+}
+
+# Baseline sweep: no additional environment overrides
+run_tests "default backend" ""
+
+# Optional second sweep, enabled by a non-empty FLASHINFER variable
+if [[ -z "${FLASHINFER:-}" ]]; then
+  echo "FLASHINFER not set, skipping FLASHINFER runs."
+else
+  echo "FLASHINFER is set, rerunning with VLLM_ATTENTION_BACKEND=FLASHINFER"
+  run_tests "FLASHINFER backend" "VLLM_ATTENTION_BACKEND=FLASHINFER"
+fi