diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index d29903bf497f4..97dcc42312f68 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -3,6 +3,9 @@
 # This script runs test inside the corresponding ROCm docker container.
 set -o pipefail
 
+# Export Python path
+export PYTHONPATH=".."
+
 # Print ROCm version
 echo "--- Confirming Clean Initial State"
 while true; do
@@ -74,6 +77,15 @@ HF_MOUNT="/root/.cache/huggingface"
 
 commands=$@
 echo "Commands:$commands"
+
+if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
+  commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
+fi
+
+if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
+  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
+fi
+
 #ignore certain kernels tests
 if [[ $commands == *" kernels/core"* ]]; then
   commands="${commands} \
@@ -161,6 +173,8 @@ fi
 
 PARALLEL_JOB_COUNT=8
+MYPYTHONPATH=".."
+
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
   # assign job count as the number of shards used
@@ -181,6 +195,7 @@ if [[ $commands == *"--shard-id="* ]]; then
       -e AWS_SECRET_ACCESS_KEY \
       -v "${HF_CACHE}:${HF_MOUNT}" \
       -e "HF_HOME=${HF_MOUNT}" \
+      -e "PYTHONPATH=${MYPYTHONPATH}" \
       --name "${container_name}_${GPU}" \
       "${image_name}" \
       /bin/bash -c "${commands_gpu}" \
@@ -211,6 +226,7 @@ else
       -e AWS_SECRET_ACCESS_KEY \
       -v "${HF_CACHE}:${HF_MOUNT}" \
       -e "HF_HOME=${HF_MOUNT}" \
+      -e "PYTHONPATH=${MYPYTHONPATH}" \
       --name "${container_name}" \
       "${image_name}" \
       /bin/bash -c "${commands}"
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 01d04759f5362..f7e4af4f2af43 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -32,6 +32,7 @@ steps:
 ##### fast check tests #####
 
 - label: Documentation Build # 2min
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/test_docs/docs"
   fast_check: true
   no_gpu: True
@@ -42,6 +43,7 @@
   - grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html
 
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
   - tests/mq_llm_engine
@@ -62,6 +64,7 @@
   - pytest -v -s worker # Worker
 
 - label: Python-only Installation Test
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - tests/standalone_tests/python_only_compile.sh
   - setup.py
@@ -69,7 +72,7 @@
   - bash standalone_tests/python_only_compile.sh
 
 - label: Basic Correctness Test # 30min
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
   fast_check: true
   torch_nightly: true
   source_file_dependencies:
@@ -86,6 +89,7 @@
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 
 - label: Chunked Prefill Test
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
   - tests/basic_correctness/test_chunked_prefill
@@ -94,7 +98,7 @@
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
 
 - label: Core Test # 10min
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
   fast_check: true
   source_file_dependencies:
   - vllm/core
@@ -104,10 +108,10 @@ steps:
   - pytest -v -s core
 
 - label: Entrypoints Test # 40min
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   fast_check: true
   torch_nightly: true
-  #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/entrypoints/llm
@@ -126,6 +130,7 @@
   - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
 - label: Distributed Tests (4 GPUs) # 10min
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -158,7 +163,7 @@
   - popd
 
 - label: Metrics, Tracing Test # 10min
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
   num_gpus: 2
   source_file_dependencies:
   - vllm/
@@ -172,7 +177,7 @@
 ##### 1 GPU test #####
 
 - label: Regression Test # 5min
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/
   - tests/test_regression
@@ -182,7 +187,7 @@
   working_dir: "/vllm-workspace/tests" # optional
 
 - label: Engine Test # 10min
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/
   - tests/engine
@@ -196,7 +201,7 @@
   - pytest -v -s tokenization
 
 - label: V1 Test
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
   - tests/v1
@@ -221,8 +226,8 @@
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 
 - label: Examples Test # 25min
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/examples"
-  #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/entrypoints
   - examples/
@@ -246,7 +251,7 @@
   - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 
 - label: Prefix Caching Test # 9min
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/
   - tests/prefix_caching
@@ -254,6 +259,7 @@
   - pytest -v -s prefix_caching
 
 - label: Samplers Test # 36min
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/model_executor/layers
   - vllm/sampling_metadata.py
@@ -264,7 +270,7 @@
   - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
 
 - label: LogitsProcessor Test # 5min
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/model_executor/layers
   - vllm/model_executor/guided_decoding
@@ -275,6 +281,7 @@
   - pytest -v -s model_executor/test_guided_processors.py
 
 - label: Speculative decoding tests # 40min
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/spec_decode
   - tests/spec_decode
@@ -285,7 +292,7 @@
   - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
 
 - label: LoRA Test %N # 15min each
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/lora
   - tests/lora
@@ -293,6 +300,7 @@
   parallelism: 4
 
 - label: PyTorch Compilation Unit Tests
+  mirror_hardwares: [amdexperimental, amdproduction]
   torch_nightly: true
   source_file_dependencies:
   - vllm/
@@ -303,6 +311,7 @@
   - pytest -v -s compile/test_sequence_parallelism.py
 
 - label: PyTorch Fullgraph Smoke Test # 9min
+  mirror_hardwares: [amdexperimental, amdproduction]
   torch_nightly: true
   source_file_dependencies:
   - vllm/
@@ -314,6 +323,7 @@
   - pytest -v -s compile/piecewise/test_toy_llama.py
 
 - label: PyTorch Fullgraph Test # 18min
+  mirror_hardwares: [amdexperimental, amdproduction]
   torch_nightly: true
   source_file_dependencies:
   - vllm/
@@ -322,7 +332,7 @@
   - pytest -v -s compile/test_full_graph.py
 
 - label: Kernels Core Operation Test
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - csrc/
   - tests/kernels/core
@@ -330,7 +340,7 @@
   - pytest -v -s kernels/core
 
 - label: Kernels Attention Test %N
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - csrc/attention/
   - vllm/attention
@@ -341,7 +351,7 @@
   parallelism: 2
 
 - label: Kernels Quantization Test %N
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - csrc/quantization/
   - vllm/model_executor/layers/quantization
@@ -351,7 +361,7 @@
   parallelism: 2
 
 - label: Kernels MoE Test
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - csrc/moe/
   - tests/kernels/moe
@@ -360,7 +370,7 @@
   - pytest -v -s kernels/moe
 
 - label: Kernels Mamba Test
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - csrc/mamba/
   - tests/kernels/mamba
@@ -368,7 +378,7 @@
   - pytest -v -s kernels/mamba
 
 - label: Tensorizer Test # 11min
-  # mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental]
   soft_fail: true
   source_file_dependencies:
   - vllm/model_executor/model_loader
@@ -379,14 +389,15 @@
   - pytest -v -s tensorizer_loader
 
 - label: Benchmarks # 9min
+  mirror_hardwares: [amdexperimental, amdproduction]
   working_dir: "/vllm-workspace/.buildkite"
-  mirror_hardwares: [amd]
   source_file_dependencies:
   - benchmarks/
   commands:
   - bash scripts/run-benchmarks.sh
 
 - label: Benchmarks CLI Test # 10min
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/
   - tests/benchmarks/
@@ -394,6 +405,7 @@
   - pytest -v -s benchmarks/
 
 - label: Quantization Test
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
@@ -402,6 +414,7 @@
   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
 
 - label: LM Eval Small Models # 53min
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   source_file_dependencies:
   - csrc/
@@ -411,6 +424,7 @@
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
 
 - label: OpenAI API correctness
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - csrc/
   - vllm/entrypoints/openai/
@@ -419,6 +433,7 @@
   - pytest -s entrypoints/openai/correctness/
 
 - label: Encoder Decoder tests # 5min
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
   - tests/encoder_decoder
@@ -426,8 +441,8 @@
   - pytest -v -s encoder_decoder
 
 - label: OpenAI-Compatible Tool Use # 20 min
+  mirror_hardwares: [amdexperimental]
   fast_check: false
-  #mirror_hardwares: [ amd ]
   source_file_dependencies:
   - vllm/
   - tests/tool_use
@@ -439,6 +454,7 @@
 ##### models test #####
 
 - label: Basic Models Test # 24min
+  mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
   - vllm/
@@ -454,7 +470,7 @@
   - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
 
 - label: Language Models Test (Standard)
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
   - tests/models/language
@@ -464,6 +480,7 @@
   - pytest -v -s models/language -m core_model
 
 - label: Language Models Test (Extended)
+  mirror_hardwares: [amdexperimental]
   optional: true
   source_file_dependencies:
   - vllm/
@@ -474,7 +491,7 @@
   - pytest -v -s models/language -m 'not core_model'
 
 - label: Multi-Modal Models Test (Standard)
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
   - tests/models/multimodal
@@ -485,6 +502,7 @@
   - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
 
 - label: Multi-Modal Models Test (Extended) 1
+  mirror_hardwares: [amdexperimental]
   optional: true
   source_file_dependencies:
   - vllm/
@@ -494,6 +512,7 @@
   - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
 
 - label: Multi-Modal Models Test (Extended) 2
+  mirror_hardwares: [amdexperimental]
   optional: true
   source_file_dependencies:
   - vllm/
@@ -503,6 +522,7 @@
   - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
 
 - label: Multi-Modal Models Test (Extended) 3
+  mirror_hardwares: [amdexperimental]
   optional: true
   source_file_dependencies:
   - vllm/
@@ -512,7 +532,7 @@
   - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
 
 - label: Quantized Models Test
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/model_executor/layers/quantization
   - tests/models/quantization
@@ -521,7 +541,7 @@
 # This test is used only in PR development phase to test individual models and should never run on main
 
 - label: Custom Models Test
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
   optional: true
   commands:
   - echo 'Testing custom models...'
@@ -533,7 +553,7 @@ steps:
 ##### multi gpus test #####
 
 - label: Distributed Comm Ops Test # 7min
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
@@ -544,6 +564,7 @@
   - pytest -v -s distributed/test_shm_broadcast.py
 
 - label: 2 Node Tests (4 GPUs in total) # 16min
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   num_nodes: 2
@@ -562,7 +583,7 @@
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
 
 - label: Distributed Tests (2 GPUs) # 40min
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
@@ -599,6 +620,7 @@
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
 
 - label: Plugin Tests (2 GPUs) # 40min
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
@@ -618,6 +640,7 @@
   - pytest -v -s models/test_oot_registration.py # it needs a clean process
 
 - label: Multi-step Tests (4 GPUs) # 36min
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -638,6 +661,7 @@
   - pytest -v -s multi_step/test_correctness_llm.py
 
 - label: Pipeline Parallelism Test # 45min
+  mirror_hardwares: [amdexperimental, amdproduction]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -651,6 +675,7 @@
   - pytest -v -s distributed/test_pipeline_parallel.py
 
 - label: LoRA TP Test (Distributed)
+  mirror_hardwares: [amdexperimental, amdproduction]
   num_gpus: 4
   source_file_dependencies:
   - vllm/lora
@@ -666,6 +691,7 @@
 
 
 - label: Weight Loading Multiple GPU Test # 33min
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt
index 52fbf787f1dff..abd4212c6e35d 100644
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -1,3 +1,5 @@
+# Common dependencies
+-r common.txt
 # entrypoints test
 # librosa==0.10.2.post1 # required by audio tests in entrypoints/openai