diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index d5d4043a1d5bc..67088caa8150b 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -51,7 +51,7 @@ steps:
 - label: Async Engine, Inputs, Utils, Worker Test # 10min
   timeout_in_minutes: 15
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  agent_pool: mi325_1
+  agent_pool: mi355_1
   grade: Blocking
   source_file_dependencies:
   - vllm/
@@ -64,7 +64,7 @@ steps:
 - label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
   timeout_in_minutes: 20
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  agent_pool: mi325_1
+  agent_pool: mi355_1
   grade: Blocking
   source_file_dependencies:
   - vllm/
@@ -99,7 +99,7 @@ steps:
 - label: Basic Correctness Test # 20min
   timeout_in_minutes: 30
   mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_1
+  agent_pool: mi355_1
   # grade: Blocking
   fast_check: true
   torch_nightly: true
@@ -116,7 +116,7 @@ steps:
 
 - label: Entrypoints Unit Tests # 5min
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  agent_pool: mi325_1
+  agent_pool: mi355_1
   grade: Blocking
   timeout_in_minutes: 10
   working_dir: "/vllm-workspace/tests"
@@ -131,7 +131,7 @@ steps:
 - label: Entrypoints Integration Test (LLM) # 30min
   timeout_in_minutes: 40
   mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_1
+  agent_pool: mi355_1
   # grade: Blocking
   working_dir: "/vllm-workspace/tests"
   fast_check: true
@@ -254,7 +254,7 @@ steps:
 
 - label: EPLB Algorithm Test # 5min
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  agent_pool: mi325_1
+  agent_pool: mi355_1
   grade: Blocking
   timeout_in_minutes: 15
   working_dir: "/vllm-workspace/tests"
@@ -266,7 +266,7 @@ steps:
 
 - label: EPLB Execution Test # 10min
   mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_4
+  agent_pool: mi355_4
   # grade: Blocking
   timeout_in_minutes: 20
   working_dir: "/vllm-workspace/tests"
@@ -281,7 +281,7 @@ steps:
 - label: Metrics, Tracing Test # 12min
   timeout_in_minutes: 20
   mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_2
+  agent_pool: mi355_2
   # grade: Blocking
   num_gpus: 2
   source_file_dependencies:
@@ -301,7 +301,7 @@ steps:
 - label: Regression Test # 7min
   timeout_in_minutes: 20
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  agent_pool: mi325_1
+  agent_pool: mi355_1
   grade: Blocking
   source_file_dependencies:
   - vllm/
@@ -343,7 +343,7 @@ steps:
 - label: V1 Test entrypoints # 35min
   timeout_in_minutes: 50
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  agent_pool: mi325_1
+  agent_pool: mi355_1
   grade: Blocking
   source_file_dependencies:
     - vllm/
@@ -544,7 +544,7 @@ steps:
 - label: PyTorch Fullgraph Test # 27min
   timeout_in_minutes: 40
   mirror_hardwares: [amdexperimental, amdproduction]
-  agent_pool: mi325_1
+  agent_pool: mi355_1
   # grade: Blocking
   torch_nightly: true
   source_file_dependencies:
@@ -715,6 +715,7 @@ steps:
   # we can only upgrade after this is resolved
   # TODO(jerryzh168): resolve the above comment
   - uv pip install --system torchao==0.13.0
+  - uv pip install --system conch-triton-kernels
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
 - label: LM Eval Small Models # 15min
@@ -934,6 +935,18 @@ steps:
   commands:
     - pytest -v -s models/language/pooling_mteb_test
 
+- label: Multi-Modal Processor Test (CPU)  # NOTE(review): no runtime estimate recorded; sibling labels carry one (e.g. "# 44min")
+  timeout_in_minutes: 60
+  mirror_hardwares: [amdexperimental]
+  agent_pool: mi325_1  # NOTE(review): other steps in this change move from mi325_* to mi355_* pools - confirm mi325_1 is intended
+  source_file_dependencies:  # NOTE(review): presumably the paths whose changes trigger this step - confirm
+  - vllm/
+  - tests/models/multimodal
+  no_gpu: true  # NOTE(review): presumably requests a run without GPUs, matching the "(CPU)" label - confirm
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git  # no ref given, so the install is not pinned to a specific commit
+    - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py  # runs the processing tests, skipping test_tensor_schema.py
+
 - label: Multi-Modal Processor Test # 44min
   timeout_in_minutes: 60
   mirror_hardwares: [amdexperimental]
@@ -1472,14 +1485,14 @@ steps:
   working_dir: "/vllm-workspace/"
   num_gpus: 2
   commands:
-    - pytest -v -s tests/compile/distributed/test_async_tp.py
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
     - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
     - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
     #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
-    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
-    - pytest -v -s tests/distributed/test_sequence_parallel.py
+    - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
     - pytest -v -s tests/distributed/test_context_parallel.py
-    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
+    - HIP_VISIBLE_DEVICES=0,1 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
     - pytest -v -s tests/v1/distributed/test_dbo.py
 
 ##### B200 test #####