diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index 1fc3dbd8c21f4..6b9c0121c4aa8 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -44,7 +44,6 @@ docker run \
   pytest -v -s v1/structured_output
   pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
   pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
+  pytest -v -s v1/test_metrics
   pytest -v -s v1/test_serial_utils.py
-  pytest -v -s v1/test_utils.py
-  pytest -v -s v1/test_metrics_reader.py
 '
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index c6c4e2a2309fc..e603c1582e1fb 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -159,10 +159,7 @@ steps:
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
-  - tests/v1/test_async_llm_dp.py
-  - tests/v1/test_external_lb_dp.py
-  - tests/v1/test_internal_lb_dp.py
-  - tests/v1/test_hybrid_lb_dp.py
+  - tests/v1/distributed
   - tests/v1/engine/test_engine_core_client.py
   - tests/distributed/test_symm_mem_allreduce.py
   commands:
@@ -180,10 +177,10 @@ steps:
   - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
   - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
@@ -300,12 +297,9 @@ steps:
   - pytest -v -s v1/spec_decode
   - pytest -v -s v1/kv_connector/unit
   - pytest -v -s v1/metrics
-  - pytest -v -s v1/test_kv_sharing.py
-  - pytest -v -s v1/test_metrics_reader.py
   - pytest -v -s v1/test_oracle.py
   - pytest -v -s v1/test_request.py
   - pytest -v -s v1/test_serial_utils.py
-  - pytest -v -s v1/test_utils.py
   # Integration test for streaming correctness (requires special branch).
   - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@@ -895,14 +889,13 @@ steps:
   - tests/compile/test_wrapper.py
   - tests/distributed/
   - tests/entrypoints/llm/test_collective_rpc.py
-  - tests/v1/test_async_llm_dp.py
-  - tests/v1/test_external_lb_dp.py
+  - tests/v1/distributed
   - tests/v1/entrypoints/openai/test_multi_api_servers.py
   - tests/v1/shutdown
   - tests/v1/worker/test_worker_memory_snapshot.py
   commands:
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
   - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
   - pytest -v -s entrypoints/llm/test_collective_rpc.py
   - pytest -v -s ./compile/test_basic_correctness.py
diff --git a/tests/v1/test_kv_sharing.py b/tests/v1/core/test_kv_sharing.py
similarity index 100%
rename from tests/v1/test_kv_sharing.py
rename to tests/v1/core/test_kv_sharing.py
diff --git a/tests/v1/distributed/__init__.py b/tests/v1/distributed/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/distributed/test_async_llm_dp.py
similarity index 100%
rename from tests/v1/test_async_llm_dp.py
rename to tests/v1/distributed/test_async_llm_dp.py
diff --git a/tests/v1/test_external_lb_dp.py b/tests/v1/distributed/test_external_lb_dp.py
similarity index 100%
rename from tests/v1/test_external_lb_dp.py
rename to tests/v1/distributed/test_external_lb_dp.py
diff --git a/tests/v1/test_hybrid_lb_dp.py b/tests/v1/distributed/test_hybrid_lb_dp.py
similarity index 99%
rename from tests/v1/test_hybrid_lb_dp.py
rename to tests/v1/distributed/test_hybrid_lb_dp.py
index 552436f818d77..21d8009a6dbb7 100644
--- a/tests/v1/test_hybrid_lb_dp.py
+++ b/tests/v1/distributed/test_hybrid_lb_dp.py
@@ -12,7 +12,7 @@ import pytest_asyncio
 import requests
 
 from tests.utils import RemoteOpenAIServer
-from tests.v1.test_utils import check_request_balancing
+from tests.v1.utils import check_request_balancing
 from vllm.platforms import current_platform
 
 MODEL_NAME = "ibm-research/PowerMoE-3b"
diff --git a/tests/v1/test_internal_lb_dp.py b/tests/v1/distributed/test_internal_lb_dp.py
similarity index 99%
rename from tests/v1/test_internal_lb_dp.py
rename to tests/v1/distributed/test_internal_lb_dp.py
index e965645711ee6..3f9defd13dead 100644
--- a/tests/v1/test_internal_lb_dp.py
+++ b/tests/v1/distributed/test_internal_lb_dp.py
@@ -13,7 +13,7 @@ import pytest_asyncio
 import requests
 
 from tests.utils import RemoteOpenAIServer
-from tests.v1.test_utils import check_request_balancing
+from tests.v1.utils import check_request_balancing
 from vllm.platforms import current_platform
 
 MODEL_NAME = "ibm-research/PowerMoE-3b"
diff --git a/tests/v1/entrypoints/openai/test_multi_api_servers.py b/tests/v1/entrypoints/openai/test_multi_api_servers.py
index f7c31b0c43778..35f75191d9c8d 100644
--- a/tests/v1/entrypoints/openai/test_multi_api_servers.py
+++ b/tests/v1/entrypoints/openai/test_multi_api_servers.py
@@ -8,7 +8,7 @@ import pytest
 import pytest_asyncio
 
 from tests.utils import RemoteOpenAIServer
-from tests.v1.test_utils import check_request_balancing
+from tests.v1.utils import check_request_balancing
 
 MODEL_NAME = "ibm-research/PowerMoE-3b"
diff --git a/tests/v1/test_metrics_reader.py b/tests/v1/metrics/test_metrics_reader.py
similarity index 100%
rename from tests/v1/test_metrics_reader.py
rename to tests/v1/metrics/test_metrics_reader.py
diff --git a/tests/v1/test_utils.py b/tests/v1/utils.py
similarity index 67%
rename from tests/v1/test_utils.py
rename to tests/v1/utils.py
index 00d98a873a310..b3f560c11e8f5 100644
--- a/tests/v1/test_utils.py
+++ b/tests/v1/utils.py
@@ -1,71 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
 import pytest
 import regex as re
 import requests
-import torch
 
 from tests.utils import RemoteOpenAIServer
-from vllm.v1.worker.utils import bind_kv_cache
-
-
-def test_bind_kv_cache():
-    from vllm.attention import Attention
-
-    ctx = {
-        'layers.0.self_attn': Attention(32, 128, 0.1),
-        'layers.1.self_attn': Attention(32, 128, 0.1),
-        'layers.2.self_attn': Attention(32, 128, 0.1),
-        'layers.3.self_attn': Attention(32, 128, 0.1),
-    }
-    kv_cache = {
-        'layers.0.self_attn': torch.zeros((1, )),
-        'layers.1.self_attn': torch.zeros((1, )),
-        'layers.2.self_attn': torch.zeros((1, )),
-        'layers.3.self_attn': torch.zeros((1, )),
-    }
-    runner_kv_caches: list[torch.Tensor] = []
-    bind_kv_cache(kv_cache, ctx, runner_kv_caches)
-    assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[
-        'layers.0.self_attn']
-    assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[
-        'layers.1.self_attn']
-    assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[
-        'layers.2.self_attn']
-    assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[
-        'layers.3.self_attn']
-
-    assert runner_kv_caches[0] is kv_cache['layers.0.self_attn']
-    assert runner_kv_caches[1] is kv_cache['layers.1.self_attn']
-    assert runner_kv_caches[2] is kv_cache['layers.2.self_attn']
-    assert runner_kv_caches[3] is kv_cache['layers.3.self_attn']
-
-
-def test_bind_kv_cache_non_attention():
-    from vllm.attention import Attention
-
-    # example from Jamba PP=2
-    ctx = {
-        'model.layers.20.attn': Attention(32, 128, 0.1),
-        'model.layers.28.attn': Attention(32, 128, 0.1),
-    }
-    kv_cache = {
-        'model.layers.20.attn': torch.zeros((1, )),
-        'model.layers.28.attn': torch.zeros((1, )),
-    }
-
-    runner_kv_caches: list[torch.Tensor] = []
-    bind_kv_cache(kv_cache, ctx, runner_kv_caches)
-
-    assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[
-        'model.layers.20.attn']
-    assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[
-        'model.layers.28.attn']
-
-    assert runner_kv_caches[0] is kv_cache['model.layers.20.attn']
-    assert runner_kv_caches[1] is kv_cache['model.layers.28.attn']
-
 # Prometheus metrics utilities for testing
diff --git a/tests/v1/worker/test_utils.py b/tests/v1/worker/test_utils.py
new file mode 100644
index 0000000000000..fd0e630ce178a
--- /dev/null
+++ b/tests/v1/worker/test_utils.py
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm.v1.worker.utils import bind_kv_cache
+
+
+def test_bind_kv_cache():
+    from vllm.attention import Attention
+
+    ctx = {
+        'layers.0.self_attn': Attention(32, 128, 0.1),
+        'layers.1.self_attn': Attention(32, 128, 0.1),
+        'layers.2.self_attn': Attention(32, 128, 0.1),
+        'layers.3.self_attn': Attention(32, 128, 0.1),
+    }
+    kv_cache = {
+        'layers.0.self_attn': torch.zeros((1, )),
+        'layers.1.self_attn': torch.zeros((1, )),
+        'layers.2.self_attn': torch.zeros((1, )),
+        'layers.3.self_attn': torch.zeros((1, )),
+    }
+    runner_kv_caches: list[torch.Tensor] = []
+    bind_kv_cache(kv_cache, ctx, runner_kv_caches)
+    assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[
+        'layers.0.self_attn']
+    assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[
+        'layers.1.self_attn']
+    assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[
+        'layers.2.self_attn']
+    assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[
+        'layers.3.self_attn']
+
+    assert runner_kv_caches[0] is kv_cache['layers.0.self_attn']
+    assert runner_kv_caches[1] is kv_cache['layers.1.self_attn']
+    assert runner_kv_caches[2] is kv_cache['layers.2.self_attn']
+    assert runner_kv_caches[3] is kv_cache['layers.3.self_attn']
+
+
+def test_bind_kv_cache_non_attention():
+    from vllm.attention import Attention
+
+    # example from Jamba PP=2
+    ctx = {
+        'model.layers.20.attn': Attention(32, 128, 0.1),
+        'model.layers.28.attn': Attention(32, 128, 0.1),
+    }
+    kv_cache = {
+        'model.layers.20.attn': torch.zeros((1, )),
+        'model.layers.28.attn': torch.zeros((1, )),
+    }
+
+    runner_kv_caches: list[torch.Tensor] = []
+    bind_kv_cache(kv_cache, ctx, runner_kv_caches)
+
+    assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[
+        'model.layers.20.attn']
+    assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[
+        'model.layers.28.attn']
+
+    assert runner_kv_caches[0] is kv_cache['model.layers.20.attn']
+    assert runner_kv_caches[1] is kv_cache['model.layers.28.attn']