Merge branch 'main' into Add_support_for_openpangu_promoe_v2

2026-07-07 14:27:21 +08:00 · 2025-12-24 17:25:37 -05:00 · 2025-12-24 17:25:37 -05:00 · 6304606fad
commit 6304606fad
parent 1aff6fff56 09dc7c690c
183 changed files with 3323 additions and 2373 deletions
--- a/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on chartqa for vllm.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install lm-eval==0.4.9
+#   pip install "lm-eval[api]>=0.4.9.2"

 usage() {
    echo``
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install "lm-eval[api]>=0.4.9.2"

 usage() {
    echo``
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install "lm-eval[api]>=0.4.9.2"

 usage() {
    echo``
--- a/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh
@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install "lm-eval[api]>=0.4.9.2"

 usage() {
    echo``
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"

--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
+    && python3 -m pip install --progress-bar off "lm-eval[api]>=0.4.9.2" \
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"

--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@ -162,7 +162,10 @@ steps:
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/test_vision_embeds.py
+  # Use tf32 matmul precision to avoid a precision conflict with terratorch on ROCm.
+  # TODO: Remove after next torch update
+  - VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s entrypoints/openai/test_vision_embeds.py
  - pytest -v -s entrypoints/test_chat_utils.py

 - label: Entrypoints Integration Test (API Server 2)
@ -219,6 +222,9 @@ steps:
  - tests/v1/engine/test_engine_core_client.py
  - tests/distributed/test_symm_mem_allreduce.py
  commands:
+  # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
+  # TODO: Remove when the bug is fixed in a future ROCm release
+  - export TORCH_NCCL_BLOCKING_WAIT=1
  # test with torchrun tp=2 and external_dp=2
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=2 and pp=2
@ -267,9 +273,10 @@ steps:
  - vllm/v1/executor/uniproc_executor.py
  - vllm/v1/worker/gpu_worker.py
  commands:
-  # https://github.com/NVIDIA/nccl/issues/1838
-  #- export NCCL_CUMEM_HOST_ENABLE=0
  # test with torchrun tp=2 and dp=4 with ep
+  # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
+  # TODO: Remove when the bug is fixed in a future ROCm release
+  - export TORCH_NCCL_BLOCKING_WAIT=1
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep

 - label: EPLB Algorithm Test # 5min
@ -979,7 +986,10 @@ steps:
    - export MIOPEN_DEBUG_CONV_GEMM=0
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pip freeze | grep -E 'torch'
-    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py
+    # Use tf32 matmul precision to avoid a precision conflict with terratorch on ROCm.
+    # TODO: Remove after next torch update
+    - VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work

 - label: Multi-Modal Accuracy Eval (Small Models) # 5min
@ -1288,6 +1298,9 @@ steps:
  - tests/v1/shutdown
  - tests/v1/worker/test_worker_memory_snapshot.py
  commands:
+  # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
+  # TODO: Remove when the bug is fixed in a future ROCm release
+  - export TORCH_NCCL_BLOCKING_WAIT=1
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
@ -1341,7 +1354,9 @@ steps:
  # end platform plugin tests
  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
  - pip install -e ./plugins/prithvi_io_processor_plugin
-  - pytest -v -s plugins_tests/test_io_processor_plugins.py
+  # Use tf32 matmul precision to avoid a precision conflict with terratorch on ROCm.
+  # TODO: Remove after next torch update
+  - VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
  # begin stat_logger plugins test
@ -1510,7 +1525,7 @@ steps:
    - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
    - pytest -v -s tests/distributed/test_context_parallel.py
-    - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
+    - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
    - pytest -v -s tests/v1/distributed/test_dbo.py

 ##### B200 test #####
--- a/csrc/cache.h
+++ b/csrc/cache.h
@ -9,16 +9,6 @@
 void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
                 const torch::Tensor& block_mapping);

-// Note: the key_caches and value_caches vectors are constant but
-// not the Tensors they contain. The vectors need to be const refs
-// in order to satisfy pytorch's C++ operator registration code.
-void copy_blocks(std::vector<torch::Tensor> const& key_caches,
-                 std::vector<torch::Tensor> const& value_caches,
-                 const torch::Tensor& block_mapping);
-
-void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
-                     const torch::Tensor& block_mapping);
-
 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                       torch::Tensor& key_cache, torch::Tensor& value_cache,
                       torch::Tensor& slot_mapping,
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@ -119,94 +119,6 @@ __global__ void copy_blocks_mla_kernel(

 }  // namespace vllm

-// Note: the key_caches and value_caches vectors are constant but
-// not the Tensors they contain. The vectors need to be const refs
-// in order to satisfy pytorch's C++ operator registration code.
-void copy_blocks(std::vector<torch::Tensor> const& key_caches,
-                 std::vector<torch::Tensor> const& value_caches,
-                 const torch::Tensor& block_mapping) {
-  int num_layers = key_caches.size();
-  TORCH_CHECK(num_layers == value_caches.size());
-  if (num_layers == 0) {
-    return;
-  }
-  torch::Device cache_device = key_caches[0].device();
-  TORCH_CHECK(cache_device.is_cuda());
-
-  // Create data structures for the kernel.
-  // Create an array of pointers to the key and value caches.
-  int64_t key_cache_ptrs[num_layers];
-  int64_t value_cache_ptrs[num_layers];
-  for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
-    key_cache_ptrs[layer_idx] =
-        reinterpret_cast<int64_t>(key_caches[layer_idx].data_ptr());
-    value_cache_ptrs[layer_idx] =
-        reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
-  }
-
-  // block_mapping is a 2D tensor with shape (num_pairs, 2).
-  int num_pairs = block_mapping.size(0);
-
-  // Move the data structures to the GPU.
-  // NOTE: This synchronizes the CPU and GPU.
-  torch::Tensor key_cache_ptrs_tensor =
-      torch::from_blob(key_cache_ptrs, {num_layers}, torch::kInt64)
-          .to(cache_device);
-  torch::Tensor value_cache_ptrs_tensor =
-      torch::from_blob(value_cache_ptrs, {num_layers}, torch::kInt64)
-          .to(cache_device);
-
-  // Launch the kernel.
-  const int numel_per_block = key_caches[0][0].numel();
-  dim3 grid(num_layers, num_pairs);
-  dim3 block(std::min(1024, numel_per_block));
-  const at::cuda::OptionalCUDAGuard device_guard(cache_device);
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
-      key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] {
-        vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
-            key_cache_ptrs_tensor.data_ptr<int64_t>(),
-            value_cache_ptrs_tensor.data_ptr<int64_t>(),
-            block_mapping.data_ptr<int64_t>(), numel_per_block);
-      }));
-}
-
-// copy blocks kernel for MLA (assumes a joint KV-cache)
-void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
-                     const torch::Tensor& block_mapping) {
-  int num_layers = kv_caches.size();
-  if (num_layers == 0) {
-    return;
-  }
-  torch::Device cache_device = kv_caches[0].device();
-  TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA");
-
-  std::vector<int64_t> cache_ptrs(num_layers);
-  for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
-    cache_ptrs[layer_idx] =
-        reinterpret_cast<int64_t>(kv_caches[layer_idx].data_ptr());
-  }
-  torch::Tensor cache_ptrs_tensor =
-      torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64)
-          .to(cache_device);
-
-  int num_pairs = block_mapping.size(0);
-  // We use the stride instead of numel in case the cache is padded for memory
-  // alignment reasons, we assume the blocks data (inclusive of any padding)
-  // is contiguous in memory
-  int mem_footprint_per_block = kv_caches[0].stride(0);
-  dim3 grid(num_layers, num_pairs);
-  dim3 block(std::min(1024, mem_footprint_per_block));
-  const at::cuda::OptionalCUDAGuard device_guard(cache_device);
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
-      kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] {
-        vllm::copy_blocks_mla_kernel<scalar_t><<<grid, block, 0, stream>>>(
-            cache_ptrs_tensor.data_ptr<int64_t>(),
-            block_mapping.data_ptr<int64_t>(), mem_footprint_per_block);
-      }));
-}
-
 namespace vllm {

 // Used to copy/convert one element
@ -539,9 +451,6 @@ __global__ void indexer_k_quant_and_cache_kernel(
  for (int i = 0; i < VEC_SIZE; i++) {
    amax = fmaxf(amax, fabsf(float(k_val_ptr[i])));
  }
-#ifndef USE_ROCM
-  __syncwarp();
-#endif

  // Reduced amax
  for (int mask = 16; mask > 0; mask /= 2) {
@ -551,9 +460,7 @@ __global__ void indexer_k_quant_and_cache_kernel(
    amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
 #endif
  }
-#ifndef USE_ROCM
-  __syncwarp();
-#endif
+
 #if defined(__gfx942__)
  float scale = fmaxf(amax, 1e-4) / 224.0f;
 #else
--- a/csrc/cpu/utils.cpp
+++ b/csrc/cpu/utils.cpp
@ -24,6 +24,8 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
 #ifndef VLLM_NUMA_DISABLED
 std::string init_cpu_threads_env(const std::string& cpu_ids) {
  bitmask* omp_cpu_mask = numa_parse_cpustring_all(cpu_ids.c_str());
+  TORCH_CHECK(omp_cpu_mask != nullptr,
+              "Failed to parse CPU string: " + cpu_ids);
  TORCH_CHECK(omp_cpu_mask->size > 0);
  std::vector<int> omp_cpu_ids;
  omp_cpu_ids.reserve(omp_cpu_mask->size);
@ -44,20 +46,12 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {

  // Memory node binding
  if (numa_available() != -1) {
-    int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front());
    std::set<int> node_ids;
    for (const auto& cpu_id : omp_cpu_ids) {
      int node_id = numa_node_of_cpu(cpu_id);
      if (node_id != -1) {
        node_ids.insert(node_id);
      }
-      if (node_id != mem_node_id) {
-        TORCH_WARN("CPU ", cpu_id, " is on NUMA node ", node_id, ", but CPU ",
-                   omp_cpu_ids.front(), " is on NUMA node ", mem_node_id,
-                   ". All CPUs should be on the same NUMA node for optimal "
-                   "performance. Memory will be bound to NUMA node ",
-                   mem_node_id, ".");
-      }
    }
    // Concatenate all node_ids into a single comma-separated string
    if (!node_ids.empty()) {
@ -70,7 +64,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
      }

      bitmask* mask = numa_parse_nodestring(node_ids_str.c_str());
-      bitmask* src_mask = numa_get_membind();
+      bitmask* src_mask = numa_get_mems_allowed();

      int pid = getpid();

@ -83,15 +77,46 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
                     std::to_string(errno));
        }

-        // restrict memory allocation node.
-        numa_set_membind(mask);
+        // Restrict memory allocation to the selected NUMA node(s).
+        // Enhances memory locality for the threads bound to those NUMA CPUs.
+        if (node_ids.size() > 1) {
+          errno = 0;
+          numa_set_interleave_mask(mask);
+          if (errno != 0) {
+            TORCH_WARN("numa_set_interleave_mask failed. errno: " +
+                       std::to_string(errno));
+          } else {
+            TORCH_WARN(
+                "NUMA binding: Using INTERLEAVE policy for memory "
+                "allocation across multiple NUMA nodes (nodes: " +
+                node_ids_str +
+                "). Memory allocations will be "
+                "interleaved across the specified NUMA nodes.");
+          }
+        } else {
+          errno = 0;
+          numa_set_membind(mask);
+          if (errno != 0) {
+            TORCH_WARN("numa_set_membind failed. errno: " +
+                       std::to_string(errno));
+          } else {
+            TORCH_WARN(
+                "NUMA binding: Using MEMBIND policy for memory "
+                "allocation on the NUMA nodes (" +
+                node_ids_str +
+                "). Memory allocations will be "
+                "strictly bound to these NUMA nodes.");
+          }
+        }
+
        numa_set_strict(1);

        numa_free_nodemask(mask);
        numa_free_nodemask(src_mask);
      } else {
-        TORCH_WARN("numa_parse_nodestring or numa_get_membind failed. errno: " +
-                   std::to_string(errno));
+        TORCH_WARN(
+            "numa_parse_nodestring or numa_get_run_node_mask failed. errno: " +
+            std::to_string(errno));
      }
    }
  }
--- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
@ -35,7 +35,7 @@ template <typename Int>
 __host__ __device__ inline Int round_up(Int x, Int y) {
  static_assert(std::is_integral_v<Int>,
                "round_up argument must be integral type");
-  return (x + y - 1) / y * y;
+  return ((x + y - 1) / y) * y;
 }

 // Compute effective rows for grid configuration with swizzled SF layouts.
@ -61,37 +61,47 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
  int sf_m = round_up<int>(numRows, 128);
  int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
  int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
-  for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) {
-    // Each thread writes 4 uint32_t elements.
-    for (int col = sf_n_unpadded + threadIdx.x * 4; col < sf_n_int;
-         col += blockDim.x * 4) {
-      SFout[row * sf_n_int + col] = 0x00;
-    }
-  }
+  int num_padded_cols = sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE;

  // Get the global scaling factor, which will be applied to the SF.
  // Note SFScale is the same as next GEMM's alpha, which is
  // (448.f / (Alpha_A / 6.f)).
  float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];

-  // Input tensor row/col loops.
-  for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
-    for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD;
+  // Iterate over all rows and columns, including padded ones, so that
+  // every scale-factor address is visited and initialized.
+  for (int rowIdx = blockIdx.x; rowIdx < sf_m; rowIdx += gridDim.x) {
+    for (int colIdx = threadIdx.x;
+         colIdx < num_padded_cols / CVT_FP4_ELTS_PER_THREAD;
         colIdx += blockDim.x) {
+      int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
+
+      PackedVec in_vec;
      int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
-      PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
-      // Get the output tensor offset.
-      // Same as inOffset because 8 elements are packed into one uint32_t.
-      int64_t outOffset = inOffset;
-      auto& out_pos = out[outOffset];
+
+      // If we are outside valid rows OR outside valid columns -> Use Zeros
+      if (rowIdx >= numRows || elem_idx >= numCols) {
+        memset(&in_vec, 0, sizeof(PackedVec));
+
+      } else {
+        // Valid Region: Load actual data
+        in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+      }

      auto sf_out =
          cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
                                             CVT_FP4_NUM_THREADS_PER_SF>(
              rowIdx, colIdx, numKTiles, SFout);

-      out_pos =
+      auto out_val =
          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
+
+      // We do NOT write output for padding because the 'out' tensor is not
+      // padded.
+      if (rowIdx < numRows && elem_idx < numCols) {
+        // The output offset equals inOffset: 8 elements pack into one uint32_t.
+        out[inOffset] = out_val;
+      }
    }
  }
 }
@ -134,4 +144,4 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
        m, n, input_ptr, input_sf_ptr, reinterpret_cast<uint32_t*>(output_ptr),
        reinterpret_cast<uint32_t*>(sf_out));
  });
-}
+}
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -685,16 +685,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
      "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
  cache_ops.impl("swap_blocks", torch::kCUDA, &swap_blocks);

-  // Copy the cache blocks from src to dst.
-  cache_ops.def(
-      "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
-      "Tensor block_mapping) -> ()");
-  cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
-
-  cache_ops.def(
-      "copy_blocks_mla(Tensor(a!)[] kv_caches, Tensor block_mapping) -> ()");
-  cache_ops.impl("copy_blocks_mla", torch::kCUDA, &copy_blocks_mla);
-
  // Reshape the key and value tensors and cache them.
  cache_ops.def(
      "reshape_and_cache(Tensor key, Tensor value,"
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -183,7 +183,7 @@ ARG nvcc_threads=8
 ENV NVCC_THREADS=$nvcc_threads

 ARG USE_SCCACHE
-ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
+ARG SCCACHE_DOWNLOAD_URL
 ARG SCCACHE_ENDPOINT
 ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
 ARG SCCACHE_REGION_NAME=us-west-2
@ -201,10 +201,16 @@ ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
 RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$USE_SCCACHE" = "1" ]; then \
        echo "Installing sccache..." \
+        && case "${TARGETPLATFORM}" in \
+          linux/arm64) SCCACHE_ARCH="aarch64" ;; \
+          linux/amd64) SCCACHE_ARCH="x86_64" ;; \
+          *) echo "Unsupported TARGETPLATFORM for sccache: ${TARGETPLATFORM}" >&2; exit 1 ;; \
+        esac \
+        && export SCCACHE_DOWNLOAD_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
        && curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
        && tar -xzf sccache.tar.gz \
-        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
-        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
+        && sudo mv sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
+        && rm -rf sccache.tar.gz sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl \
        && if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
        && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
        && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
--- a/docs/api/README.md
+++ b/docs/api/README.md
@ -72,7 +72,6 @@ Internal data structures.
 - [vllm.multimodal.inputs.MultiModalFieldConfig][]
 - [vllm.multimodal.inputs.MultiModalKwargsItem][]
 - [vllm.multimodal.inputs.MultiModalKwargsItems][]
- [vllm.multimodal.inputs.MultiModalKwargs][]
 - [vllm.multimodal.inputs.MultiModalInputs][]

 ### Data Parsing
--- a/docs/deployment/integrations/kserve.md
+++ b/docs/deployment/integrations/kserve.md
@ -2,4 +2,4 @@

 vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.

-Please see [this guide](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) for more details on using vLLM with KServe.
+You can use vLLM with KServe's [Hugging Face serving runtime](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) or via [`LLMInferenceService` that uses llm-d](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview).
--- a/docs/deployment/integrations/llm-d.md
+++ b/docs/deployment/integrations/llm-d.md
@ -0,0 +1,5 @@
+# llm-d
+
+vLLM can be deployed with [llm-d](https://github.com/llm-d/llm-d), a Kubernetes-native distributed inference serving stack providing well-lit paths for anyone to serve large generative AI models at scale. It helps achieve the fastest "time to state-of-the-art (SOTA) performance" for key OSS models across most hardware accelerators and infrastructure providers.
+
+You can use vLLM with llm-d directly by following [this guide](https://llm-d.ai/docs/guide) or via [KServe's LLMInferenceService](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview).
--- a/docs/deployment/k8s.md
+++ b/docs/deployment/k8s.md
@ -12,6 +12,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:

 - [Helm](frameworks/helm.md)
 - [InftyAI/llmaz](integrations/llmaz.md)
+- [llm-d](integrations/llm-d.md)
 - [KAITO](integrations/kaito.md)
 - [KServe](integrations/kserve.md)
 - [Kthena](integrations/kthena.md)
--- a/docs/features/quantization/fp8.md
+++ b/docs/features/quantization/fp8.md
@ -84,7 +84,7 @@ Since simple RTN does not require data for weight quantization and the activatio
 Install `vllm` and `lm-evaluation-harness` for evaluation:

 ```bash
-pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+pip install vllm "lm-eval[api]>=0.4.9.2"
 ```

 Load and run the model in `vllm`:
--- a/docs/features/quantization/int4.md
+++ b/docs/features/quantization/int4.md
@ -18,7 +18,7 @@ pip install llmcompressor
 Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:

 ```bash
-pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+pip install vllm "lm-eval[api]>=0.4.9.2"
 ```

 ## Quantization Process
--- a/docs/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@ -23,7 +23,7 @@ pip install llmcompressor
 Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:

 ```bash
-pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+pip install vllm "lm-eval[api]>=0.4.9.2"
 ```

 ## Quantization Process
--- a/docs/features/quantization/quark.md
+++ b/docs/features/quantization/quark.md
@ -20,7 +20,7 @@ for more installation details.
 Additionally, install `vllm` and `lm-evaluation-harness` for evaluation:

 ```bash
-pip install vllm git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+pip install vllm "lm-eval[api]>=0.4.9.2"
 ```

 ## Quantization Process
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@ -27,7 +27,7 @@ mistral_common[image,audio] >= 1.8.5 # required for voxtral test
 num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
-lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
+lm-eval[api]>=0.4.9.2 # required for model evaluation test
 mteb>=1.38.11, <2 # required for mteb test
 transformers==4.57.3
 tokenizers==0.22.0
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@ -58,7 +58,7 @@ schemathesis==3.39.15
    # OpenAI schema test

 # Evaluation and benchmarking
-lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
+lm-eval[api]>=0.4.9.2
 jiwer==4.0.0

 # Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test
--- a/requirements/test.in
+++ b/requirements/test.in
@ -34,8 +34,7 @@ num2words # required for smolvlm test
 open_clip_torch==2.32.0 # Required for nemotron_vl test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
-# TODO: Use lm-eval[api]==0.4.10 once released
-lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
+lm-eval[api]>=0.4.9.2 # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
 transformers==4.57.3
 tokenizers==0.22.0
--- a/requirements/test.txt
+++ b/requirements/test.txt
@ -441,7 +441,7 @@ lightning-utilities==0.14.3
    #   torchmetrics
 llvmlite==0.44.0
    # via numba
-lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
+lm-eval==0.4.9.2
    # via -r requirements/test.in
 lxml==5.3.0
    # via
--- a/setup.py
+++ b/setup.py
@ -50,15 +50,15 @@ elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin"))
        sys.platform,
    )
    VLLM_TARGET_DEVICE = "empty"
-elif (
-    sys.platform.startswith("linux")
-    and torch.version.cuda is None
-    and os.getenv("VLLM_TARGET_DEVICE") is None
-    and torch.version.hip is None
-):
-    # if cuda or hip is not available and VLLM_TARGET_DEVICE is not set,
-    # fallback to cpu
-    VLLM_TARGET_DEVICE = "cpu"
+elif sys.platform.startswith("linux") and os.getenv("VLLM_TARGET_DEVICE") is None:
+    if torch.version.hip is not None:
+        VLLM_TARGET_DEVICE = "rocm"
+        logger.info("Auto-detected ROCm")
+    elif torch.version.cuda is not None:
+        VLLM_TARGET_DEVICE = "cuda"
+        logger.info("Auto-detected CUDA")
+    else:
+        VLLM_TARGET_DEVICE = "cpu"


 def is_sccache_available() -> bool:
@ -108,20 +108,26 @@ class cmake_build_ext(build_ext):
                num_jobs = os.cpu_count()

        nvcc_threads = None
-        if _is_cuda() and get_nvcc_cuda_version() >= Version("11.2"):
-            # `nvcc_threads` is either the value of the NVCC_THREADS
-            # environment variable (if defined) or 1.
-            # when it is set, we reduce `num_jobs` to avoid
-            # overloading the system.
-            nvcc_threads = envs.NVCC_THREADS
-            if nvcc_threads is not None:
-                nvcc_threads = int(nvcc_threads)
-                logger.info(
-                    "Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads
-                )
-            else:
-                nvcc_threads = 1
-            num_jobs = max(1, num_jobs // nvcc_threads)
+        if _is_cuda() and CUDA_HOME is not None:
+            try:
+                nvcc_version = get_nvcc_cuda_version()
+                if nvcc_version >= Version("11.2"):
+                    # `nvcc_threads` is either the value of the NVCC_THREADS
+                    # environment variable (if defined) or 1.
+                    # when it is set, we reduce `num_jobs` to avoid
+                    # overloading the system.
+                    nvcc_threads = envs.NVCC_THREADS
+                    if nvcc_threads is not None:
+                        nvcc_threads = int(nvcc_threads)
+                        logger.info(
+                            "Using NVCC_THREADS=%d as the number of nvcc threads.",
+                            nvcc_threads,
+                        )
+                    else:
+                        nvcc_threads = 1
+                    num_jobs = max(1, num_jobs // nvcc_threads)
+            except Exception as e:
+                logger.warning("Failed to get NVCC version: %s", e)

        return num_jobs, nvcc_threads

@ -199,9 +205,9 @@ class cmake_build_ext(build_ext):
            # Default build tool to whatever cmake picks.
            build_tool = []
        # Make sure we use the nvcc from CUDA_HOME
-        if _is_cuda():
+        if _is_cuda() and CUDA_HOME is not None:
            cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"]
-        elif _is_hip():
+        elif _is_hip() and ROCM_HOME is not None:
            cmake_args += [f"-DROCM_PATH={ROCM_HOME}"]

        other_cmake_args = os.environ.get("CMAKE_ARGS")
@ -339,6 +345,89 @@ class precompiled_wheel_utils:
            wheels = json.loads(resp.read().decode("utf-8"))
        return wheels, repo_url

+    @staticmethod
+    def is_rocm_system() -> bool:
+        """Detect ROCm, preferring checks that do not require torch."""
+        if os.getenv("ROCM_PATH"):
+            return True
+        if os.path.isdir("/opt/rocm"):
+            return True
+        if which("rocminfo") is not None:
+            return True
+        try:
+            import torch
+
+            return torch.version.hip is not None
+        except ImportError:
+            return False
+
+    @staticmethod
+    def find_local_rocm_wheel() -> str | None:
+        """Search for a local vllm wheel in common locations."""
+        import glob
+
+        for pattern in ["/vllm-workspace/dist/vllm-*.whl", "./dist/vllm-*.whl"]:
+            wheels = glob.glob(pattern)
+            if wheels:
+                return sorted(wheels)[-1]
+        return None
+
+    @staticmethod
+    def fetch_wheel_from_pypi_index(index_url: str, package: str = "vllm") -> str:
+        """Fetch the latest wheel URL from a PyPI-style simple index."""
+        import platform
+        from html.parser import HTMLParser
+        from urllib.parse import urljoin
+        from urllib.request import urlopen
+
+        arch = platform.machine()
+
+        class WheelLinkParser(HTMLParser):
+            def __init__(self):
+                super().__init__()
+                self.wheels = []
+
+            def handle_starttag(self, tag, attrs):
+                if tag == "a":
+                    for name, value in attrs:
+                        if name == "href" and value.endswith(".whl"):
+                            self.wheels.append(value)
+
+        simple_url = f"{index_url.rstrip('/')}/{package}/"
+        print(f"Fetching wheel list from {simple_url}")
+        with urlopen(simple_url) as resp:
+            html = resp.read().decode("utf-8")
+
+        parser = WheelLinkParser()
+        parser.feed(html)
+
+        for wheel in reversed(parser.wheels):
+            if arch in wheel:
+                if wheel.startswith("http"):
+                    return wheel
+                return urljoin(simple_url, wheel)
+
+        raise ValueError(f"No compatible wheel found for {arch} at {simple_url}")
+
+    @staticmethod
+    def determine_wheel_url_rocm() -> tuple[str, str | None]:
+        """Determine the precompiled wheel for ROCm."""
+        # Search for local wheel first
+        local_wheel = precompiled_wheel_utils.find_local_rocm_wheel()
+        if local_wheel is not None:
+            print(f"Found local ROCm wheel: {local_wheel}")
+            return local_wheel, None
+
+        # Fall back to AMD's PyPI index
+        index_url = os.getenv(
+            "VLLM_ROCM_WHEEL_INDEX", "https://pypi.amd.com/vllm-rocm/simple"
+        )
+        print(f"Fetching ROCm precompiled wheel from {index_url}")
+        wheel_url = precompiled_wheel_utils.fetch_wheel_from_pypi_index(index_url)
+        download_filename = wheel_url.split("/")[-1].split("#")[0]
+        print(f"Using ROCm precompiled wheel: {wheel_url}")
+        return wheel_url, download_filename
+
    @staticmethod
    def determine_wheel_url() -> tuple[str, str | None]:
        """
@ -359,6 +448,11 @@ class precompiled_wheel_utils:
            print(f"Using user-specified precompiled wheel location: {wheel_location}")
            return wheel_location, None
        else:
+            # ROCm: use local wheel or AMD's PyPI index
+            # TODO: When we have ROCm nightly wheels, we can update this logic.
+            if precompiled_wheel_utils.is_rocm_system():
+                return precompiled_wheel_utils.determine_wheel_url_rocm()
+
            import platform

            arch = platform.machine()
@ -465,6 +559,8 @@ class precompiled_wheel_utils:
                    "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
                    "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
                    "vllm/cumem_allocator.abi3.so",
+                    # ROCm-specific libraries
+                    "vllm/_rocm_C.abi3.so",
                ]

                flash_attn_regex = re.compile(
@ -601,6 +697,8 @@ def get_rocm_version():
    # Get the Rocm version from the ROCM_HOME/bin/librocm-core.so
    # see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
    try:
+        if ROCM_HOME is None:
+            return None
        librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
        if not librocm_core_file.is_file():
            return None
@ -745,7 +843,9 @@ if _is_hip():

 if _is_cuda():
    ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
-    if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
+    if envs.VLLM_USE_PRECOMPILED or (
+        CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3")
+    ):
        # FA3 requires CUDA 12.3 or later
        ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
        # Optional since this doesn't get built (produce an .so file) when
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -410,7 +410,7 @@ class HfRunner:

        # don't put this import at the top level
        # it will call torch.cuda.device_count()
-        from transformers import AutoProcessor  # noqa: F401
+        from transformers import AutoProcessor

        self.processor = AutoProcessor.from_pretrained(
            model_name,
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@ -511,6 +511,16 @@ def test_human_readable_model_len():
    args = parser.parse_args(["--max-model-len", "10.2123451234567t"])
    assert args.max_model_len == 10212345123456

+    # Special value -1 for auto-fit to GPU memory
+    args = parser.parse_args(["--max-model-len", "-1"])
+    assert args.max_model_len == -1
+
+    # 'auto' is an alias for -1
+    args = parser.parse_args(["--max-model-len", "auto"])
+    assert args.max_model_len == -1
+    args = parser.parse_args(["--max-model-len", "AUTO"])
+    assert args.max_model_len == -1
+
    # Invalid (do not allow decimals with binary multipliers)
    for invalid in ["1a", "pwd", "10.24", "1.23M", "1.22T"]:
        with pytest.raises(ArgumentError):
--- a/tests/entrypoints/openai/conftest.py
+++ b/tests/entrypoints/openai/conftest.py
@ -5,6 +5,30 @@ import pytest
 from vllm.assets.audio import AudioAsset


+def add_attention_backend(server_args, attention_config):
+    """Append attention backend CLI arg if specified.
+
+    Args:
+        server_args: List of server arguments to extend in-place.
+        attention_config: Dict with 'backend' key, or None.
+    """
+    if attention_config and "backend" in attention_config:
+        server_args.extend(["--attention-backend", attention_config["backend"]])
+
+
+@pytest.fixture(scope="module")
+def rocm_aiter_fa_attention():
+    """Return attention config for transcription/translation tests on ROCm.
+
+    On ROCm, audio tests require ROCM_AITER_FA attention backend.
+    """
+    from vllm.platforms import current_platform
+
+    if current_platform.is_rocm():
+        return {"backend": "ROCM_AITER_FA"}
+    return None
+
+
@pytest.fixture
 def mary_had_lamb():
    path = AudioAsset("mary_had_lamb").get_local_path()
--- a/tests/entrypoints/openai/test_async_tokenization.py
+++ b/tests/entrypoints/openai/test_async_tokenization.py
@ -15,7 +15,7 @@ MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"


@pytest.fixture(scope="module")
-def server():  # noqa: F811
+def server():
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@ -8,7 +8,7 @@ import pytest
 import pytest_asyncio

 from vllm.assets.audio import AudioAsset
-from vllm.multimodal.utils import encode_audio_base64, fetch_audio
+from vllm.multimodal.utils import encode_audio_base64, encode_audio_url, fetch_audio

 from ...utils import RemoteOpenAIServer

@ -53,6 +53,14 @@ def base64_encoded_audio() -> dict[str, str]:
    }


+@pytest.fixture(scope="session")
+def url_encoded_audio() -> dict[str, str]:
+    return {
+        audio_url: encode_audio_url(*fetch_audio(audio_url))
+        for audio_url in TEST_AUDIO_URLS
+    }
+
+
 def dummy_messages_from_audio_url(
    audio_urls: str | list[str],
    content_text: str = "What's happening in this audio?",
@ -149,11 +157,9 @@ async def test_single_chat_session_audio_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    audio_url: str,
-    base64_encoded_audio: dict[str, str],
+    url_encoded_audio: dict[str, str],
 ):
-    messages = dummy_messages_from_audio_url(
-        f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
-    )
+    messages = dummy_messages_from_audio_url(url_encoded_audio[audio_url])

    # test single completion
    chat_completion = await client.chat.completions.create(
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@ -28,7 +28,7 @@ def zephyr_lora_files():


@pytest.fixture(scope="module")
-def server(zephyr_lora_files):  # noqa: F811
+def server(zephyr_lora_files):
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
@ -254,12 +254,11 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]
-
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
-        max_completion_tokens=10,
+        max_completion_tokens=5,
        logprobs=True,
        top_logprobs=5,
    )
@ -267,13 +266,14 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
+
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=37, total_tokens=47
+        completion_tokens=5, prompt_tokens=37, total_tokens=42
    )

    message = choice.message
-    assert message.content is not None and len(message.content) >= 10
+    assert message.content is not None and len(message.content) >= 5
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

@ -282,7 +282,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
-        max_completion_tokens=10,
+        max_completion_tokens=5,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
--- a/tests/entrypoints/openai/test_chat_error.py
+++ b/tests/entrypoints/openai/test_chat_error.py
@ -13,7 +13,7 @@ from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ErrorRespons
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.outputs import CompletionOutput, RequestOutput
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 from vllm.v1.engine.async_llm import AsyncLLM

 MODEL_NAME = "openai-community/gpt2"
--- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
+++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
@ -12,7 +12,7 @@ MODEL_NAME = "Qwen/QwQ-32B"


@pytest.fixture(scope="module")
-def server():  # noqa: F811
+def server():
    args = [
        "--max-model-len",
        "8192",
--- a/tests/entrypoints/openai/test_completion_error.py
+++ b/tests/entrypoints/openai/test_completion_error.py
@ -13,7 +13,7 @@ from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.outputs import CompletionOutput, RequestOutput
-from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.tokenizers import get_tokenizer
 from vllm.v1.engine.async_llm import AsyncLLM

 MODEL_NAME = "openai-community/gpt2"
--- a/tests/entrypoints/openai/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/test_completion_with_function_calling.py
@ -125,7 +125,7 @@ messages = [


@pytest.fixture(scope="module")
-def server():  # noqa: F811
+def server():
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
@ -212,7 +212,7 @@ async def test_function_tool_use(


@pytest.fixture(scope="module")
-def k2_server():  # noqa: F811
+def k2_server():
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
--- a/tests/entrypoints/openai/test_default_mm_loras.py
+++ b/tests/entrypoints/openai/test_default_mm_loras.py
@ -23,7 +23,7 @@ ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original


@pytest.fixture(scope="module")
-def multimodal_server():  # noqa: F811
+def multimodal_server():
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
--- a/tests/entrypoints/openai/test_enable_force_include_usage.py
+++ b/tests/entrypoints/openai/test_enable_force_include_usage.py
@ -8,7 +8,7 @@ from ...utils import RemoteOpenAIServer


@pytest.fixture(scope="module")
-def chat_server_with_force_include_usage(request):  # noqa: F811
+def chat_server_with_force_include_usage(request):
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
@ -61,13 +61,13 @@ class MockLoRAResolver(LoRAResolver):
            return LoRARequest(
                lora_name="test-lora",
                lora_int_id=1,
-                lora_local_path="/fake/path/test-lora",
+                lora_path="/fake/path/test-lora",
            )
        elif lora_name == "invalid-lora":
            return LoRARequest(
                lora_name="invalid-lora",
                lora_int_id=2,
-                lora_local_path="/fake/path/invalid-lora",
+                lora_path="/fake/path/invalid-lora",
            )
        return None

--- a/tests/entrypoints/openai/test_messages.py
+++ b/tests/entrypoints/openai/test_messages.py
@ -11,7 +11,7 @@ MODEL_NAME = "Qwen/Qwen3-0.6B"


@pytest.fixture(scope="module")
-def server():  # noqa: F811
+def server():
    args = [
        "--max-model-len",
        "2048",
--- a/tests/entrypoints/openai/test_optional_middleware.py
+++ b/tests/entrypoints/openai/test_optional_middleware.py
@ -39,6 +39,7 @@ def server(request: pytest.FixtureRequest):
        "2",
        *passed_params,
    ]
+
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server

--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@ -504,7 +504,11 @@ async def test_web_search(client: OpenAI, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_code_interpreter(client: OpenAI, model_name: str):
-    response = await client.responses.create(
+    # Code interpreter may need more time for container init + code execution
+    timeout_value = client.timeout * 3
+    client_with_timeout = client.with_options(timeout=timeout_value)
+
+    response = await client_with_timeout.responses.create(
        model=model_name,
        # TODO: Ideally should be able to set max tool calls
        # to prevent multi-turn, but it is not currently supported
@ -868,6 +872,7 @@ async def test_output_messages_enabled(client: OpenAI, model_name: str, server):

@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.flaky(reruns=3)
 async def test_function_call_with_previous_input_messages(
    client: OpenAI, model_name: str
 ):
--- a/tests/entrypoints/openai/test_return_tokens_as_ids.py
+++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py
@ -37,7 +37,7 @@ def default_server_args(qwen3_lora_files):


@pytest.fixture(scope="module")
-def server_fixture(request, default_server_args):  # noqa: F811
+def server_fixture(request, default_server_args):
    use_server_flag = request.param
    if use_server_flag:
        args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]
--- a/tests/entrypoints/openai/test_serving_tokens.py
+++ b/tests/entrypoints/openai/test_serving_tokens.py
@ -93,6 +93,7 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
        add_generation_prompt=True,
        enable_thinking=False,  # default with Qwen3
    )
+
    for ignore_eos in [True, False]:
        payload = {
            "model": MODEL_NAME,
@ -108,9 +109,8 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
        }
        generate_resp = await client.post(GEN_ENDPOINT, json=payload)
        generate_data = generate_resp.json()
-        generate_res = tokenizer.decode(
-            generate_data["choices"][0]["token_ids"], skip_special_tokens=True
-        )
+        gen_token_ids = generate_data["choices"][0]["token_ids"]
+        generate_res = tokenizer.decode(gen_token_ids, skip_special_tokens=True)

        payload = {
            "model": MODEL_NAME,
@ -119,12 +119,33 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
            "temperature": 0.0,
            "stream": False,
            "ignore_eos": ignore_eos,
-            "chat_template_kwargs": dict(enable_thinking=False),
+            "chat_template_kwargs": {"enable_thinking": False},
        }
        completions_resp = await client.post("/v1/chat/completions", json=payload)
        completions_data = completions_resp.json()
        completions_res = completions_data["choices"][0]["message"]["content"]

+        if ignore_eos:
+            # When ignoring EOS, only compare up to the first EOS token
+            # Post-EOS generation is undefined and may differ
+            eos_tokens = {
+                tokenizer.eos_token_id,
+                *tokenizer.additional_special_tokens_ids,
+            }
+            # Find first EOS in generated tokens
+            eos_pos = None
+            for i, tid in enumerate(gen_token_ids):
+                if tid in eos_tokens:
+                    eos_pos = i
+                    break
+            if eos_pos is not None:
+                gen_token_ids_truncated = gen_token_ids[:eos_pos]
+                generate_res = tokenizer.decode(
+                    gen_token_ids_truncated, skip_special_tokens=True
+                )
+                # Truncate completions_res to same length for comparison
+                completions_res = completions_res[: len(generate_res)]
+
        assert generate_res == completions_res


--- a/tests/entrypoints/openai/test_shutdown.py
+++ b/tests/entrypoints/openai/test_shutdown.py
@ -9,10 +9,16 @@ import time
 import openai
 import pytest

+from vllm.platforms import current_platform
 from vllm.utils.network_utils import get_open_port

 MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"

+# GPU initialization might take longer
+_IS_ROCM = current_platform.is_rocm()
+_SERVER_STARTUP_TIMEOUT = 120
+_PROCESS_EXIT_TIMEOUT = 15
+

@pytest.mark.asyncio
 async def test_shutdown_on_engine_failure():
@ -45,9 +51,11 @@ async def test_shutdown_on_engine_failure():
            "2",
            "--disable-frontend-multiprocessing",
        ],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-        text=True,
+        # ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when
+        # stdout/stderr pipes are enabled during ROCm GPU initialization.
+        stdout=None if _IS_ROCM else subprocess.PIPE,
+        stderr=None if _IS_ROCM else subprocess.PIPE,
+        text=None if _IS_ROCM else True,
        preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN),
    )

@ -61,7 +69,7 @@ async def test_shutdown_on_engine_failure():
    )

    # Poll until server is ready
-    while time.time() - start_time < 30:
+    while time.time() - start_time < _SERVER_STARTUP_TIMEOUT:
        try:
            await client.completions.create(
                model=MODEL_NAME, prompt="Hello", max_tokens=1
@ -70,14 +78,18 @@ async def test_shutdown_on_engine_failure():
        except Exception:
            time.sleep(0.5)
            if proc.poll() is not None:
-                stdout, stderr = proc.communicate(timeout=1)
-                pytest.fail(
-                    f"Server died during startup. stdout: {stdout}, stderr: {stderr}"
-                )
+                if _IS_ROCM:
+                    pytest.fail(f"Server died during startup: {proc.returncode}")
+                else:
+                    stdout, stderr = proc.communicate(timeout=1)
+                    pytest.fail(
+                        f"Server died during startup. "
+                        f"stdout: {stdout}, stderr: {stderr}"
+                    )
    else:
        proc.terminate()
-        proc.wait(timeout=5)
-        pytest.fail("Server failed to start in 30 seconds")
+        proc.wait(timeout=_PROCESS_EXIT_TIMEOUT)
+        pytest.fail(f"Server failed to start in {_SERVER_STARTUP_TIMEOUT} seconds")

    # Kill server to simulate crash
    proc.terminate()
@ -89,5 +101,5 @@ async def test_shutdown_on_engine_failure():
            model=MODEL_NAME, prompt="This should fail", max_tokens=1
        )

-    return_code = proc.wait(timeout=5)
+    return_code = proc.wait(timeout=_PROCESS_EXIT_TIMEOUT)
    assert return_code is not None
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@ -7,6 +7,7 @@ import json
 import pytest

 from ...utils import RemoteOpenAIServer
+from .conftest import add_attention_backend

 MISTRAL_FORMAT_ARGS = [
    "--tokenizer_mode",
@ -20,12 +21,14 @@ MISTRAL_FORMAT_ARGS = [

@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"])
-async def test_basic_audio(mary_had_lamb, model_name):
+async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention):
    server_args = ["--enforce-eager"]

    if model_name.startswith("mistralai"):
        server_args += MISTRAL_FORMAT_ARGS

+    add_attention_backend(server_args, rocm_aiter_fa_attention)
+
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
@ -44,8 +47,13 @@ async def test_basic_audio(mary_had_lamb, model_name):


@pytest.mark.asyncio
-async def test_basic_audio_with_lora(mary_had_lamb):
+async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
    """Ensure STT (transcribe) requests can pass LoRA through to generate."""
+    # ROCm SPECIFIC CONFIGURATION:
+    # To ensure the test passes on ROCm, we modify the max model length to 512.
+    # We DO NOT apply this to other platforms to maintain strict upstream parity.
+    from vllm.platforms import current_platform
+
    model_name = "ibm-granite/granite-speech-3.3-2b"
    lora_model_name = "speech"
    server_args = [
@ -56,11 +64,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
        "--lora-modules",
        f"{lora_model_name}={model_name}",
        "--max-model-len",
-        "2048",
+        "512" if current_platform.is_rocm() else "2048",
        "--max-num-seqs",
        "1",
    ]

+    add_attention_backend(server_args, rocm_aiter_fa_attention)
+
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
@ -79,12 +89,14 @@ async def test_basic_audio_with_lora(mary_had_lamb):


@pytest.mark.asyncio
-async def test_basic_audio_gemma(foscolo):
+async def test_basic_audio_gemma(foscolo, rocm_aiter_fa_attention):
    # Gemma accuracy on some of the audio samples we use is particularly bad,
    # hence we use a different one here. WER is evaluated separately.
    model_name = "google/gemma-3n-E2B-it"
    server_args = ["--enforce-eager"]

+    add_attention_backend(server_args, rocm_aiter_fa_attention)
+
    with RemoteOpenAIServer(
        model_name, server_args, max_wait_seconds=480
    ) as remote_server:
--- a/tests/entrypoints/openai/test_translation_validation.py
+++ b/tests/entrypoints/openai/test_translation_validation.py
@ -14,16 +14,26 @@ import pytest_asyncio
 import soundfile as sf

 from ...utils import RemoteOpenAIServer
+from .conftest import add_attention_backend

 SERVER_ARGS = ["--enforce-eager"]


+def _get_server_args(attention_config):
+    """Get server args with attention backend if specified."""
+    args = SERVER_ARGS.copy()
+    add_attention_backend(args, attention_config)
+    return args
+
+
@pytest.fixture(
    scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"]
 )
-def server(request):
+def server(request, rocm_aiter_fa_attention):
    # Parametrize over model name
-    with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
+    with RemoteOpenAIServer(
+        request.param, _get_server_args(rocm_aiter_fa_attention)
+    ) as remote_server:
        yield remote_server, request.param


@ -35,10 +45,12 @@ async def client_and_model(server):


@pytest.mark.asyncio
-async def test_non_asr_model(foscolo):
+async def test_non_asr_model(foscolo, rocm_aiter_fa_attention):
    # text to text model
    model_name = "JackFram/llama-68m"
-    with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
+    with RemoteOpenAIServer(
+        model_name, _get_server_args(rocm_aiter_fa_attention)
+    ) as remote_server:
        client = remote_server.get_async_client()
        res = await client.audio.translations.create(
            model=model_name, file=foscolo, temperature=0.0
@ -49,8 +61,13 @@ async def test_non_asr_model(foscolo):


@pytest.mark.asyncio
-async def test_basic_audio_with_lora(mary_had_lamb):
+async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
    """Ensure STT (translate) requests can pass LoRA through to generate."""
+    # ROCm SPECIFIC CONFIGURATION:
+    # To ensure the test passes on ROCm, we modify the max model length to 512.
+    # We DO NOT apply this to other platforms to maintain strict upstream parity.
+    from vllm.platforms import current_platform
+
    # NOTE - careful to call this test before the module scoped server
    # fixture, otherwise it'll OOMkill the CI
    model_name = "ibm-granite/granite-speech-3.3-2b"
@ -63,11 +80,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
        "--lora-modules",
        f"{lora_model_name}={model_name}",
        "--max-model-len",
-        "2048",
+        "512" if current_platform.is_rocm() else "2048",
        "--max-num-seqs",
        "1",
    ]

+    add_attention_backend(server_args, rocm_aiter_fa_attention)
+
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
--- a/tests/entrypoints/openai/test_video.py
+++ b/tests/entrypoints/openai/test_video.py
@ -7,7 +7,8 @@ import openai
 import pytest
 import pytest_asyncio

-from vllm.multimodal.utils import encode_video_base64, fetch_video
+from vllm.multimodal.utils import encode_video_url, fetch_video
+from vllm.platforms import current_platform

 from ...utils import RemoteOpenAIServer

@ -37,7 +38,16 @@ def server():
        json.dumps({"video": MAXIMUM_VIDEOS}),
    ]

-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    # ROCm: Increase timeouts to handle potential network delays and slower
+    # video processing when downloading multiple videos from external sources
+    env_overrides = {}
+    if current_platform.is_rocm():
+        env_overrides = {
+            "VLLM_VIDEO_FETCH_TIMEOUT": "120",
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
+        }
+
+    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
        yield remote_server


@ -48,9 +58,9 @@ async def client(server):


@pytest.fixture(scope="session")
-def base64_encoded_video() -> dict[str, str]:
+def url_encoded_video() -> dict[str, str]:
    return {
-        video_url: encode_video_base64(fetch_video(video_url)[0])
+        video_url: encode_video_url(fetch_video(video_url)[0])
        for video_url in TEST_VIDEO_URLS
    }

@ -175,11 +185,9 @@ async def test_single_chat_session_video_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    video_url: str,
-    base64_encoded_video: dict[str, str],
+    url_encoded_video: dict[str, str],
 ):
-    messages = dummy_messages_from_video_url(
-        f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
-    )
+    messages = dummy_messages_from_video_url(url_encoded_video[video_url])

    # test single completion
    chat_completion = await client.chat.completions.create(
@ -223,11 +231,9 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
    client: openai.AsyncOpenAI,
    model_name: str,
    video_url: str,
-    base64_encoded_video: dict[str, str],
+    url_encoded_video: dict[str, str],
 ):
-    messages = dummy_messages_from_video_url(
-        f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
-    )
+    messages = dummy_messages_from_video_url(url_encoded_video[video_url])

    chat_completion = await client.chat.completions.create(
        model=model_name,
@ -291,6 +297,11 @@ async def test_chat_streaming_video(
@pytest.mark.parametrize(
    "video_urls", [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))]
 )
+@pytest.mark.flaky(
+    reruns=2,
+    reruns_delay=5,
+    condition=current_platform.is_rocm(),
+)
 async def test_multi_video_input(
    client: openai.AsyncOpenAI, model_name: str, video_urls: list[str]
 ):
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@ -9,7 +9,8 @@ import pytest_asyncio
 from transformers import AutoProcessor

 from vllm.multimodal.base import MediaWithBytes
-from vllm.multimodal.utils import encode_image_base64, fetch_image
+from vllm.multimodal.utils import encode_image_url, fetch_image
+from vllm.platforms import current_platform

 from ...utils import RemoteOpenAIServer

@ -35,7 +36,7 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
    ],
    [
        "The image shows a Venn diagram with three over",
-        "The image shows a colorful Venn diagram with",
+        "The image displays a Venn diagram with three over",
    ],
    [
        "This image displays a gradient of colors ranging from",
@ -43,6 +44,27 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
    ],
 ]

+EXPECTED_MM_BEAM_SEARCH_RES_ROCM = [
+    # MultiHeadAttention attn_backend: FLASH_ATTN
+    # with Triton Attention backend
+    [
+        "The image shows a wooden boardwalk leading through a",
+        "The image shows a wooden boardwalk extending into a",
+    ],
+    [
+        "The image shows two parrots perched on",
+        "The image shows two birds perched on a cur",
+    ],
+    [
+        "The image shows a Venn diagram with three over",
+        "The image contains a Venn diagram with three over",
+    ],
+    [
+        "This image displays a gradient of colors ranging from",
+        "This image displays a gradient of colors transitioning from",
+    ],
+]
+

@pytest.fixture(scope="module")
 def server():
@ -59,7 +81,16 @@ def server():
        json.dumps({"image": MAXIMUM_IMAGES}),
    ]

-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    # ROCm: Increase timeouts to handle potential network delays and slower
+    # video processing when downloading multiple videos from external sources
+    env_overrides = {}
+    if current_platform.is_rocm():
+        env_overrides = {
+            "VLLM_VIDEO_FETCH_TIMEOUT": "120",
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
+        }
+
+    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
        yield remote_server


@ -70,11 +101,9 @@ async def client(server):


@pytest.fixture(scope="session")
-def base64_encoded_image(local_asset_server) -> dict[str, str]:
+def url_encoded_image(local_asset_server) -> dict[str, str]:
    return {
-        image_asset: encode_image_base64(
-            local_asset_server.get_image_asset(image_asset)
-        )
+        image_asset: encode_image_url(local_asset_server.get_image_asset(image_asset))
        for image_asset in TEST_IMAGE_ASSETS
    }

@ -234,11 +263,11 @@ async def test_single_chat_session_image_base64encoded(
    model_name: str,
    raw_image_url: str,
    image_url: str,
-    base64_encoded_image: dict[str, str],
+    url_encoded_image: dict[str, str],
 ):
    content_text = "What's in this image?"
    messages = dummy_messages_from_image_url(
-        f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
+        url_encoded_image[raw_image_url],
        content_text,
    )

@ -288,15 +317,20 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_idx: int,
-    base64_encoded_image: dict[str, str],
+    url_encoded_image: dict[str, str],
 ):
+    # ROCm: Switch expected results based on platform
+    from vllm.platforms import current_platform
+
    # NOTE: This test also validates that we pass MM data through beam search
    raw_image_url = TEST_IMAGE_ASSETS[image_idx]
-    expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]

-    messages = dummy_messages_from_image_url(
-        f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
-    )
+    if current_platform.is_rocm():
+        expected_res = EXPECTED_MM_BEAM_SEARCH_RES_ROCM[image_idx]
+    else:
+        expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
+
+    messages = dummy_messages_from_image_url(url_encoded_image[raw_image_url])

    chat_completion = await client.chat.completions.create(
        model=model_name,
--- a/tests/entrypoints/openai/test_vision_embeds.py
+++ b/tests/entrypoints/openai/test_vision_embeds.py
@ -33,6 +33,7 @@ def _terratorch_dummy_messages():
    ]


+@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
 )
--- a/tests/entrypoints/pooling/basic/test_encode.py
+++ b/tests/entrypoints/pooling/basic/test_encode.py
@ -9,11 +9,6 @@ from vllm import LLM, PoolingParams
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.platforms import current_platform

-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODEL_NAME = "intfloat/multilingual-e5-small"

 PROMPTS = [
@ -35,6 +30,12 @@ TOKEN_IDS = [

@pytest.fixture(scope="module")
 def llm():
+    # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
+    # that supports encoder-only models on ROCm.
+    attention_config = None
+    if current_platform.is_rocm():
+        attention_config = {"backend": "FLEX_ATTENTION"}
+
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
@ -44,6 +45,7 @@ def llm():
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
+        attention_config=attention_config,
    )

    yield weakref.proxy(llm)
--- a/tests/entrypoints/pooling/basic/test_truncation.py
+++ b/tests/entrypoints/pooling/basic/test_truncation.py
@ -9,11 +9,6 @@ import pytest_asyncio
 from tests.utils import RemoteOpenAIServer
 from vllm.platforms import current_platform

-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
 max_model_len = 128

@ -44,6 +39,10 @@ def server():
        str(max_model_len),
    ]

+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server

--- a/tests/entrypoints/pooling/embed/conftest.py
+++ b/tests/entrypoints/pooling/embed/conftest.py
@ -0,0 +1,28 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Pytest configuration for vLLM pooling embed tests."""
+
+import warnings
+
+import torch
+
+from vllm.platforms import current_platform
+
+
+def pytest_collection_modifyitems(config, items):
+    """Force the math SDP kernel on ROCm when pytest runs this collection hook.
+
+    Neither ``config`` nor ``items`` is inspected; the hook is only used as
+    the place where the torch SDP backend switches are flipped. On any
+    platform other than ROCm the hook does nothing.
+    """
+    on_rocm = current_platform.is_rocm()
+    if on_rocm:
+        # Flash/MemEfficient SDP on ROCm cause HF Transformers accuracy
+        # issues: https://github.com/vllm-project/vllm/issues/30167
+        # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
+        sdp_backends = torch.backends.cuda
+        sdp_backends.enable_flash_sdp(False)
+        sdp_backends.enable_mem_efficient_sdp(False)
+        sdp_backends.enable_math_sdp(True)
+
+        notice = (
+            "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
+            "to avoid HuggingFace Transformers accuracy issues"
+        )
+        warnings.warn(notice, UserWarning, stacklevel=1)
--- a/tests/entrypoints/pooling/embed/test_correctness_mteb.py
+++ b/tests/entrypoints/pooling/embed/test_correctness_mteb.py
@ -4,7 +4,7 @@ import os

 import pytest

-from tests.models.language.pooling_mteb_test.mteb_utils import (
+from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
    MTEB_EMBED_TASKS,
    MTEB_EMBED_TOL,
    OpenAIClientMtebEncoder,
@ -13,11 +13,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
 from tests.utils import RemoteOpenAIServer
 from vllm.platforms import current_platform

-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

 MODEL_NAME = "intfloat/e5-small"
@ -28,6 +23,10 @@ MAIN_SCORE = 0.7422994752439667
 def server():
    args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]

+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server

--- a/tests/entrypoints/pooling/embed/test_offline.py
+++ b/tests/entrypoints/pooling/embed/test_offline.py
@ -11,11 +11,6 @@ from vllm import LLM, PoolingParams
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.platforms import current_platform

-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODEL_NAME = "intfloat/multilingual-e5-small"

 prompts = ["The chef prepared a delicious meal."]
@ -23,6 +18,12 @@ prompts = ["The chef prepared a delicious meal."]

@pytest.fixture(scope="module")
 def llm():
+    # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
+    # that supports encoder-only models on ROCm.
+    attention_config = None
+    if current_platform.is_rocm():
+        attention_config = {"backend": "FLEX_ATTENTION"}
+
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
@ -32,6 +33,7 @@ def llm():
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
+        attention_config=attention_config,
    )

    yield weakref.proxy(llm)
--- a/tests/entrypoints/pooling/embed/test_online.py
+++ b/tests/entrypoints/pooling/embed/test_online.py
@ -28,16 +28,20 @@ from vllm.utils.serial_utils import (
    decode_pooling_output,
 )

-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODEL_NAME = "intfloat/multilingual-e5-small"
 DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
 DTYPE = "bfloat16"


+if current_platform.is_rocm():
+    # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
+    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
+    # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
+    torch.backends.cuda.enable_flash_sdp(False)
+    torch.backends.cuda.enable_mem_efficient_sdp(False)
+    torch.backends.cuda.enable_math_sdp(True)
+
+
@pytest.fixture(scope="module")
 def server():
    args = [
@ -53,6 +57,10 @@ def server():
        DUMMY_CHAT_TEMPLATE,
    ]

+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server

--- a/tests/entrypoints/pooling/embed/test_online_dimensions.py
+++ b/tests/entrypoints/pooling/embed/test_online_dimensions.py
@ -14,11 +14,6 @@ from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
 from vllm.platforms import current_platform

-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODELS = [
    EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
    EmbedModelInfo(
@ -62,6 +57,10 @@ def server(model_info, dtype: str):
            ["--trust_remote_code", "--hf_overrides", '{"matryoshka_dimensions":[256]}']
        )

+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
    with RemoteOpenAIServer(model_info.name, args) as remote_server:
        yield remote_server

--- a/tests/entrypoints/pooling/embed/test_online_long_text.py
+++ b/tests/entrypoints/pooling/embed/test_online_long_text.py
@ -18,11 +18,6 @@ from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
 from vllm.platforms import current_platform

-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-

 def _generate_random_text(word_count: int) -> str:
    """Generate random text with approximately the specified word count."""
@ -228,6 +223,10 @@ def server_with_chunked_processing():
        "0.8",
    ]

+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server

--- a/tests/entrypoints/pooling/embed/test_online_vision.py
+++ b/tests/entrypoints/pooling/embed/test_online_vision.py
@ -10,7 +10,7 @@ from transformers import AutoProcessor
 from tests.utils import VLLM_PATH, RemoteOpenAIServer
 from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
 from vllm.multimodal.base import MediaWithBytes
-from vllm.multimodal.utils import encode_image_base64, fetch_image
+from vllm.multimodal.utils import fetch_image

 MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
 MAXIMUM_IMAGES = 2
@ -48,14 +48,6 @@ def server():
        yield remote_server


-@pytest.fixture(scope="session")
-def base64_encoded_image(local_asset_server) -> dict[str, str]:
-    return {
-        image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
-        for image_url in TEST_IMAGE_ASSETS
-    }
-
-
 def get_hf_prompt_tokens(model_name, content, image_url):
    processor = AutoProcessor.from_pretrained(
        model_name, trust_remote_code=True, num_crops=4
--- a/tests/entrypoints/pooling/score/test_correctness_mteb.py
+++ b/tests/entrypoints/pooling/score/test_correctness_mteb.py
@ -4,7 +4,7 @@ import os

 import pytest

-from tests.models.language.pooling_mteb_test.mteb_utils import (
+from tests.models.language.pooling_mteb_test.mteb_score_utils import (
    MTEB_RERANK_LANGS,
    MTEB_RERANK_TASKS,
    MTEB_RERANK_TOL,
@ -15,11 +15,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
 from tests.utils import RemoteOpenAIServer
 from vllm.platforms import current_platform

-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

 MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
@ -30,6 +25,10 @@ st_main_score = 0.33457
 def server():
    args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]

+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server

--- a/tests/entrypoints/pooling/score/test_offline.py
+++ b/tests/entrypoints/pooling/score/test_offline.py
@ -11,16 +11,17 @@ from vllm import LLM, PoolingParams
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.platforms import current_platform

-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"


@pytest.fixture(scope="module")
 def llm():
+    # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
+    # that supports encoder-only models on ROCm.
+    attention_config = None
+    if current_platform.is_rocm():
+        attention_config = {"backend": "FLEX_ATTENTION"}
+
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
@ -30,6 +31,7 @@ def llm():
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
+        attention_config=attention_config,
    )

    yield weakref.proxy(llm)
--- a/tests/entrypoints/pooling/score/test_online_rerank.py
+++ b/tests/entrypoints/pooling/score/test_online_rerank.py
@ -11,11 +11,6 @@ from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
 from vllm.entrypoints.pooling.score.protocol import RerankResponse
 from vllm.platforms import current_platform

-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODEL_NAME = "BAAI/bge-reranker-base"
 DTYPE = "bfloat16"

@ -24,6 +19,10 @@ DTYPE = "bfloat16"
 def server():
    args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]

+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server

--- a/tests/entrypoints/pooling/score/test_online_score.py
+++ b/tests/entrypoints/pooling/score/test_online_score.py
@ -12,11 +12,6 @@ from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.pooling.score.protocol import ScoreResponse
 from vllm.platforms import current_platform

-if current_platform.is_rocm():
-    pytest.skip(
-        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
-    )
-
 MODELS = [
    {"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True},
    {"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False},
@ -44,6 +39,10 @@ def model(request):
 def server(model: dict[str, Any]):
    args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]

+    # ROCm: Use Flex Attention to support encoder-only self-attention.
+    if current_platform.is_rocm():
+        args.extend(["--attention-backend", "FLEX_ATTENTION"])
+
    with RemoteOpenAIServer(model["name"], args) as remote_server:
        yield remote_server

--- a/tests/entrypoints/pooling/score/test_utils.py
+++ b/tests/entrypoints/pooling/score/test_utils.py
@ -202,11 +202,10 @@ class TestGetScorePrompt:
        tokenization_kwargs,
        mock_model_no_score_template,
    ):
-        # FIXME: Models implementing SupportsScoreTemplate must use their custom
-        # template implementation by default to preserve existing functionality.
-        # Attempting to use tokenizer_config.json templates would most likely break
-        # these models, as often they just inherit the template from the original LLM.
-        # CLI --chat-template overrides are still supported.
+        # FIXME: For now, we only apply a template when one is explicitly provided.
+        # We cannot rely on the tokenizer's chat template because many models
+        # inherit junk templates from their base LLM, which breaks both the models
+        # and the tests that use them.
        with (
            patch(
                "vllm.model_executor.model_loader.get_model_cls",
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@ -25,9 +25,9 @@ from vllm.entrypoints.chat_utils import (
 )
 from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
 from vllm.multimodal.utils import (
-    encode_audio_base64,
-    encode_image_base64,
-    encode_video_base64,
+    encode_audio_url,
+    encode_image_url,
+    encode_video_url,
 )
 from vllm.tokenizers import get_tokenizer
 from vllm.tokenizers.mistral import MistralTokenizer
@ -141,22 +141,19 @@ def mistral_model_config():
@pytest.fixture(scope="module")
 def image_url():
    image = ImageAsset("cherry_blossom")
-    base64 = encode_image_base64(image.pil_image)
-    return f"data:image/jpeg;base64,{base64}"
+    return encode_image_url(image.pil_image)


@pytest.fixture(scope="module")
 def video_url():
    video = VideoAsset("baby_reading", 1)
-    base64 = encode_video_base64(video.np_ndarrays)
-    return f"data:video/jpeg;base64,{base64}"
+    return encode_video_url(video.np_ndarrays)


@pytest.fixture(scope="module")
 def audio_url():
    audio = AudioAsset("mary_had_lamb")
-    base64 = encode_audio_base64(*audio.audio_and_sample_rate)
-    return f"data:audio/ogg;base64,{base64}"
+    return encode_audio_url(*audio.audio_and_sample_rate)


 def _assert_mm_data_is_image_input(
--- a/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml
+++ b/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml
@ -0,0 +1,11 @@
+model_name: "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8"
+accuracy_threshold: 0.85
+num_questions: 1319
+num_fewshot: 5
+server_args: >-
+  --max-model-len 4096
+  --tensor-parallel-size 2
+  --enable-expert-parallel
+  --async-scheduling
+env:
+  VLLM_USE_FLASHINFER_MOE_FP8: "1"
--- a/tests/evals/gsm8k/configs/models-blackwell.txt
+++ b/tests/evals/gsm8k/configs/models-blackwell.txt
@ -4,3 +4,4 @@ Qwen1.5-MoE-W4A16-CT.yaml
 DeepSeek-V2-Lite-Instruct-FP8.yaml
 Qwen3-30B-A3B-NVFP4.yaml
 Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
+Qwen3-Next-FP8-EP2.yaml
--- a/tests/evals/gsm8k/test_gsm8k_correctness.py
+++ b/tests/evals/gsm8k/test_gsm8k_correctness.py
@ -71,6 +71,7 @@ def test_gsm8k_correctness(config_filename):
    print(f"Number of questions: {eval_config['num_questions']}")
    print(f"Number of few-shot examples: {eval_config['num_fewshot']}")
    print(f"Server args: {' '.join(server_args)}")
+    print(f"Environment variables: {env_dict}")

    # Launch server and run evaluation
    with RemoteOpenAIServer(
--- a/tests/kernels/attention/test_cache.py
+++ b/tests/kernels/attention/test_cache.py
@ -40,93 +40,6 @@ KV_CACHE_DTYPE = ["auto", "fp8"]
 RESHAPE_FLASH_IMPLEMENTATIONS = ["cuda", "triton"]


-@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
-@pytest.mark.parametrize("num_layers", NUM_LAYERS)
-@pytest.mark.parametrize("num_heads", NUM_HEADS)
-@pytest.mark.parametrize("head_size", HEAD_SIZES)
-@pytest.mark.parametrize("block_size", BLOCK_SIZES)
-@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
-@torch.inference_mode()
-def test_copy_blocks(
-    kv_cache_factory,
-    num_mappings: int,
-    num_layers: int,
-    num_heads: int,
-    head_size: int,
-    block_size: int,
-    num_blocks: int,
-    dtype: torch.dtype,
-    seed: int,
-    kv_cache_dtype: str,
-    device: str,
-) -> None:
-    if kv_cache_dtype == "fp8" and head_size % 16:
-        pytest.skip()
-    current_platform.seed_everything(seed)
-    torch.set_default_device(device)
-    torch.cuda.set_device(device)
-    # Generate random block mappings where each source block is mapped to two
-    # destination blocks.
-    assert 2 * num_mappings <= num_blocks
-    src_blocks = random.sample(range(num_blocks), num_mappings)
-    remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
-    dst_blocks = random.sample(remaining_blocks, 2 * num_mappings)
-    block_mapping: list[tuple[int, int]] = []
-    for i in range(num_mappings):
-        src = src_blocks[i]
-        dst1 = dst_blocks[2 * i]
-        dst2 = dst_blocks[2 * i + 1]
-        block_mapping.append((src, dst1))
-        block_mapping.append((src, dst2))
-
-    # Create the KV caches.
-    key_caches, value_caches = kv_cache_factory(
-        num_blocks,
-        block_size,
-        num_layers,
-        num_heads,
-        head_size,
-        kv_cache_dtype,
-        dtype,
-        seed,
-        device,
-    )
-
-    # Clone the KV caches.
-    cloned_key_caches = [key_cache.clone() for key_cache in key_caches]
-    cloned_value_caches = [value_cache.clone() for value_cache in value_caches]
-
-    # Call the copy blocks kernel.
-    block_mapping_tensor = torch.tensor(
-        block_mapping, dtype=torch.int64, device=device
-    ).view(-1, 2)
-
-    opcheck(
-        torch.ops._C_cache_ops.copy_blocks,
-        (key_caches, value_caches, block_mapping_tensor),
-        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
-        cond=(head_size == HEAD_SIZES[0]),
-    )
-    ops.copy_blocks(key_caches, value_caches, block_mapping_tensor)
-
-    # Run the reference implementation.
-    for src, dst in block_mapping:
-        for cloned_key_cache in cloned_key_caches:
-            cloned_key_cache[dst].copy_(cloned_key_cache[src])
-        for cloned_value_cache in cloned_value_caches:
-            cloned_value_cache[dst].copy_(cloned_value_cache[src])
-
-    # Compare the results.
-    for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
-        torch.testing.assert_close(key_cache, cloned_key_cache)
-    for value_cache, cloned_value_cache in zip(value_caches, cloned_value_caches):
-        torch.testing.assert_close(value_cache, cloned_value_cache)
-
-
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@ -763,73 +676,6 @@ def test_concat_and_cache_ds_mla(
        torch.testing.assert_close(kv_rope, ref_rope, atol=0.001, rtol=0.1)


-@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
-@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
-@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
-@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA)
-@pytest.mark.parametrize("num_layers", NUM_LAYERS)
-@pytest.mark.parametrize("dtype", DTYPES)
-@pytest.mark.parametrize("seed", SEEDS)
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
-@torch.inference_mode()
-def test_copy_blocks_mla(
-    kv_lora_rank: int,
-    qk_rope_head_dim: int,
-    block_size: int,
-    num_blocks: int,
-    num_layers: int,
-    dtype: torch.dtype,
-    seed: int,
-    device: str,
-    kv_cache_dtype: str,
-) -> None:
-    current_platform.seed_everything(seed)
-    torch.set_default_device(device)
-    torch.cuda.set_device(device)
-
-    entry_size = kv_lora_rank + qk_rope_head_dim
-
-    kv_caches = []
-    for _ in range(num_layers):
-        kv_cache = _create_mla_cache(
-            num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device
-        )
-        _fill_mla_cache(kv_cache, kv_cache_dtype=kv_cache_dtype)
-        kv_caches.append(kv_cache)
-
-    ref_caches = [kv_cache.clone() for kv_cache in kv_caches]
-
-    num_mappings = min(2, num_blocks // 2)
-    src_blocks = random.sample(range(num_blocks), num_mappings)
-    remaining = list(set(range(num_blocks)) - set(src_blocks))
-    dst_blocks = random.sample(remaining, 2 * num_mappings)
-    block_mapping = []
-    for i in range(num_mappings):
-        src = src_blocks[i]
-        dst1 = dst_blocks[2 * i]
-        dst2 = dst_blocks[2 * i + 1]
-        block_mapping.append((src, dst1))
-        block_mapping.append((src, dst2))
-    block_mapping_tensor = torch.tensor(
-        block_mapping, dtype=torch.int64, device=device
-    ).view(-1, 2)
-
-    for src, dst in block_mapping:
-        for ref_cache in ref_caches:
-            ref_cache[dst].copy_(ref_cache[src])
-
-    opcheck(
-        torch.ops._C_cache_ops.copy_blocks_mla,
-        (kv_caches, block_mapping_tensor),
-        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
-    )
-    ops.copy_blocks_mla(kv_caches, block_mapping_tensor)
-
-    for kv_cache, ref_cache in zip(kv_caches, ref_caches):
-        torch.testing.assert_close(kv_cache, ref_cache)
-
-
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
--- a/tests/models/language/pooling_mteb_test/mteb_embed_utils.py
+++ b/tests/models/language/pooling_mteb_test/mteb_embed_utils.py
@ -0,0 +1,228 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import mteb
+import numpy as np
+import torch
+from mteb.models import ModelMeta
+from mteb.types import Array
+from torch.utils.data import DataLoader
+
+import tests.ci_envs as ci_envs
+from tests.models.utils import (
+    EmbedModelInfo,
+    check_embeddings_close,
+    get_vllm_extra_kwargs,
+)
+
+# For most embedding models on the STS12 task (see #17175):
+# - Differences in model implementation and minor changes in tensor dtype
+#   change the score by less than 1e-4
+# - Using a different model changes the score by more than 1e-3
+# so 1e-4 is a good tolerance threshold.
+MTEB_EMBED_TASKS = ["STS12"]
+MTEB_EMBED_TOL = 1e-4
+
+
+# Placeholder mteb ModelMeta: apart from a fixed name/revision, an empty
+# framework list and the "text" modality, every field is left as None.
+# MtebEmbedMixin exposes it as its `mteb_model_meta` class attribute.
+_empty_model_meta = ModelMeta(
+    loader=None,
+    name="vllm/model",
+    revision="1",
+    release_date=None,
+    languages=None,
+    framework=[],
+    similarity_fn_name=None,
+    n_parameters=None,
+    memory_usage_mb=None,
+    max_tokens=None,
+    embed_dim=None,
+    license=None,
+    open_weights=None,
+    public_training_code=None,
+    public_training_data=None,
+    use_instructions=None,
+    training_datasets=None,
+    modalities=["text"],  # 'image' can be added to evaluate multimodal models
+)
+
+
+class MtebEmbedMixin(mteb.EncoderProtocol):
+    """Cosine-similarity helpers shared by the MTEB encoder wrappers."""
+
+    mteb_model_meta = _empty_model_meta
+
+    def similarity(
+        self,
+        embeddings1: np.ndarray,
+        embeddings2: np.ndarray,
+    ) -> np.ndarray:
+        """Return the cosine similarity of every row pair across both inputs.
+
+        Entry ``[i, j]`` of the result compares row ``i`` of ``embeddings1``
+        with row ``j`` of ``embeddings2``.
+        """
+        row_norms = np.linalg.norm(embeddings1, axis=1, keepdims=True)
+        col_norms = np.linalg.norm(embeddings2, axis=1, keepdims=True).T
+        dot_products = np.dot(embeddings1, embeddings2.T)
+        return dot_products / (row_norms * col_norms)
+
+    def similarity_pairwise(
+        self,
+        embeddings1: Array,
+        embeddings2: Array,
+    ) -> Array:
+        """Return the cosine similarity of matching rows of the two inputs.
+
+        Entry ``[i]`` of the result compares row ``i`` of ``embeddings1``
+        with row ``i`` of ``embeddings2``.
+        """
+        lengths1 = np.linalg.norm(embeddings1, axis=1, keepdims=True).flatten()
+        lengths2 = np.linalg.norm(embeddings2, axis=1, keepdims=True).flatten()
+        row_dots = np.sum(embeddings1 * embeddings2, axis=1)
+        return row_dots / (lengths1 * lengths2)
+
+
+class VllmMtebEncoder(MtebEmbedMixin):
+    """MTEB encoder that embeds text through a vLLM model's ``embed`` method."""
+
+    def __init__(self, vllm_model):
+        # Fixed seed so the shuffling done in encode() is reproducible.
+        self.rng = np.random.default_rng(seed=42)
+        self.llm = vllm_model
+
+    def encode(
+        self,
+        inputs: DataLoader[mteb.types.BatchedInput],
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        """Embed every ``"text"`` entry of ``inputs``, returned in input order.
+
+        The sentences are submitted in a random order, hoping to surface
+        potential scheduling issues; the embeddings are put back into the
+        original order before being returned.
+        """
+        texts = []
+        for batch in inputs:
+            texts.extend(batch["text"])
+
+        order = self.rng.permutation(len(texts))
+        shuffled = [texts[idx] for idx in order]
+        shuffled_embeds = np.array(self.llm.embed(shuffled, use_tqdm=False))
+        # argsort of a permutation is its inverse, which undoes the shuffle.
+        return shuffled_embeds[np.argsort(order)]
+
+
+class OpenAIClientMtebEncoder(MtebEmbedMixin):
+    """MTEB encoder that embeds text through ``client.embeddings.create``."""
+
+    def __init__(self, model_name: str, client):
+        # Fixed seed so the shuffling done in encode() is reproducible.
+        self.rng = np.random.default_rng(seed=42)
+        self.client = client
+        self.model_name = model_name
+
+    def encode(
+        self,
+        inputs: DataLoader[mteb.types.BatchedInput],
+        *args,
+        **kwargs,
+    ) -> np.ndarray:
+        """Embed every ``"text"`` entry of ``inputs``, returned in input order.
+
+        The sentences are sent in a random order, hoping to surface
+        potential scheduling issues; the embeddings are put back into the
+        original order before being returned.
+        """
+        texts = []
+        for batch in inputs:
+            texts.extend(batch["text"])
+
+        order = self.rng.permutation(len(texts))
+        shuffled = [texts[idx] for idx in order]
+
+        response = self.client.embeddings.create(
+            model=self.model_name, input=shuffled
+        )
+        shuffled_embeds = np.array([item.embedding for item in response.data])
+        # argsort of a permutation is its inverse, which undoes the shuffle.
+        return shuffled_embeds[np.argsort(order)]
+
+
+def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
+    """Evaluate ``encoder`` on the named MTEB tasks and return one score.
+
+    The returned value is the ``main_score`` of the first ``"test"`` entry
+    of the first evaluation result. Caching and the progress bar are
+    switched off for the evaluation.
+    """
+    selected_tasks = mteb.get_tasks(tasks=tasks)
+    eval_options = {"cache": None, "show_progress_bar": False}
+    results = mteb.evaluate(encoder, selected_tasks, **eval_options)
+
+    first_result = results[0]
+    return first_result.scores["test"][0]["main_score"]
+
+
+def mteb_test_embed_models(
+    hf_runner,
+    vllm_runner,
+    model_info: EmbedModelInfo,
+    vllm_extra_kwargs=None,
+    hf_model_callback=None,
+    atol=MTEB_EMBED_TOL,
+):
+    """Compare a vLLM embedding model with SentenceTransformers on MTEB.
+
+    The vLLM model is loaded first: its configuration is checked against the
+    expectations stored in ``model_info``, it is scored on
+    ``MTEB_EMBED_TASKS``, and one very long prompt is embedded to check for
+    NaNs and the embedding size. The reference score is ``model_info.mteb_score``
+    when that is set; otherwise the model is loaded through ``hf_runner`` as
+    a sentence-transformer, scored on the same tasks, and its embedding of
+    the long prompt is compared with the vLLM one.
+
+    Args:
+        hf_runner: Context-manager factory used to load the reference model.
+        vllm_runner: Context-manager factory used to load the vLLM model.
+        model_info: Model name plus the expected configuration values.
+        vllm_extra_kwargs: Extra vLLM keyword arguments, merged with
+            ``model_info`` by ``get_vllm_extra_kwargs``.
+        hf_model_callback: Optional callable invoked with the loaded
+            reference model before it is scored.
+        atol: Largest tolerated ``reference score - vLLM score``.
+
+    Raises:
+        AssertionError: If a configuration check fails, the vLLM embedding
+            contains NaN or has the wrong size, or the vLLM score is lower
+            than the reference score by ``atol`` or more.
+    """
+    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
+
+    # A single very long prompt, used below to test embedding_size, isnan
+    # and whether to use normalize.
+    example_prompts = ["The chef prepared a delicious meal." * 1000]
+
+    with vllm_runner(
+        model_info.name,
+        runner="pooling",
+        max_model_len=model_info.max_model_len,
+        **vllm_extra_kwargs,
+    ) as vllm_model:
+        model_config = vllm_model.llm.llm_engine.model_config
+
+        # Confirm whether vllm is using the correct architecture
+        if model_info.architecture:
+            assert model_info.architecture in model_config.architectures
+
+        # Confirm whether the important configs in model_config are correct.
+        if model_info.pooling_type is not None:
+            assert model_config.pooler_config.pooling_type == model_info.pooling_type
+        if model_info.attn_type is not None:
+            assert model_config.attn_type == model_info.attn_type
+        if model_info.is_prefix_caching_supported is not None:
+            assert (
+                model_config.is_prefix_caching_supported
+                == model_info.is_prefix_caching_supported
+            )
+        if model_info.is_chunked_prefill_supported is not None:
+            assert (
+                model_config.is_chunked_prefill_supported
+                == model_info.is_chunked_prefill_supported
+            )
+
+        vllm_main_score = run_mteb_embed_task(
+            VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
+        )
+        # model_config is already bound above; no need to walk the engine again.
+        vllm_dtype = model_config.dtype
+        head_dtype = model_config.head_dtype
+
+        # Test embedding_size, isnan and whether to use normalize
+        vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
+        # Build the tensor once and reuse it for both checks.
+        outputs_tensor = torch.tensor(vllm_outputs)
+        assert not torch.any(torch.isnan(outputs_tensor))
+        embedding_size = model_config.embedding_size
+        assert outputs_tensor.shape[-1] == embedding_size
+
+    # Accelerate mteb test by setting
+    # SentenceTransformers mteb score to a constant
+    if model_info.mteb_score is None:
+        with hf_runner(
+            model_info.name,
+            is_sentence_transformer=True,
+            dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
+        ) as hf_model:
+            # e.g. setting default parameters for the encode method of hf_runner
+            if hf_model_callback is not None:
+                hf_model_callback(hf_model)
+
+            st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
+            st_dtype = next(hf_model.model.parameters()).dtype
+
+            # Check embeddings close to hf outputs
+            hf_outputs = hf_model.encode(example_prompts)
+            check_embeddings_close(
+                embeddings_0_lst=hf_outputs,
+                embeddings_1_lst=vllm_outputs,
+                name_0="hf",
+                name_1="vllm",
+                tol=1e-2,
+            )
+    else:
+        st_main_score = model_info.mteb_score
+        st_dtype = "Constant"
+
+    score_gap = st_main_score - vllm_main_score
+
+    print("Model:", model_info.name)
+    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
+    print("SentenceTransformers:", st_dtype, st_main_score)
+    print("Difference:", score_gap)
+
+    # We are not concerned that the vllm mteb results are better
+    # than SentenceTransformers, so we only perform one-sided testing.
+    assert score_gap < atol, (
+        f"vLLM MTEB score {vllm_main_score} is lower than the reference "
+        f"score {st_main_score} by {score_gap}, which is not below atol={atol}"
+    )
--- a/tests/models/language/pooling_mteb_test/mteb_score_utils.py
+++ b/tests/models/language/pooling_mteb_test/mteb_score_utils.py
@ -7,37 +7,24 @@ from pathlib import Path
 import mteb
 import numpy as np
 import requests
-import torch
 from mteb.models import ModelMeta
-from mteb.types import Array
 from torch.utils.data import DataLoader

-import tests.ci_envs as ci_envs
 from tests.models.utils import (
-    EmbedModelInfo,
    RerankModelInfo,
-    check_embeddings_close,
    get_vllm_extra_kwargs,
 )

-template_home = (
-    Path(__file__).parent.parent.parent.parent.parent
-    / "examples/pooling/score/template"
-)
-
-# Most embedding models on the STS12 task (See #17175):
-# - Model implementation and minor changes in tensor dtype
-#   results in differences less than 1e-4
-# - Different model results in differences more than 1e-3
-# 1e-4 is a good tolerance threshold
-MTEB_EMBED_TASKS = ["STS12"]
-MTEB_EMBED_TOL = 1e-4
-
 # See #19344
 MTEB_RERANK_TASKS = ["NFCorpus"]
 MTEB_RERANK_LANGS = ["eng"]
 MTEB_RERANK_TOL = 2e-3

+template_home = (
+    Path(__file__).parent.parent.parent.parent.parent
+    / "examples/pooling/score/template"
+)
+
 _empty_model_meta = ModelMeta(
    loader=None,
    name="vllm/model",
@ -60,84 +47,11 @@ _empty_model_meta = ModelMeta(
 )


-class VllmMtebEncoder(mteb.EncoderProtocol):
+class MtebCrossEncoderMixin(mteb.CrossEncoderProtocol):
    mteb_model_meta = _empty_model_meta

-    def __init__(self, vllm_model):
-        self.llm = vllm_model
-        self.rng = np.random.default_rng(seed=42)
-
-    def encode(
-        self,
-        inputs: DataLoader[mteb.types.BatchedInput],
-        *args,
-        **kwargs,
-    ) -> np.ndarray:
-        # Hoping to discover potential scheduling
-        # issues by randomizing the order.
-        sentences = [text for batch in inputs for text in batch["text"]]
-        r = self.rng.permutation(len(sentences))
-        sentences = [sentences[i] for i in r]
-        outputs = self.llm.embed(sentences, use_tqdm=False)
-        embeds = np.array(outputs)
-        embeds = embeds[np.argsort(r)]
-        return embeds
-
-    def similarity(
-        self,
-        embeddings1: np.ndarray,
-        embeddings2: np.ndarray,
-    ) -> np.ndarray:
-        # Cosine similarity
-        norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
-        norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
-        sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T)
-        return sim
-
-    def similarity_pairwise(
-        self,
-        embeddings1: Array,
-        embeddings2: Array,
-    ) -> Array:
-        # Cosine similarity
-        norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
-        norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
-        sim = np.sum(embeddings1 * embeddings2, axis=1) / (
-            norm1.flatten() * norm2.flatten()
-        )
-        return sim
-
-
-class OpenAIClientMtebEncoder(VllmMtebEncoder):
-    def __init__(self, model_name: str, client):
-        self.model_name = model_name
-        self.client = client
-        self.rng = np.random.default_rng(seed=42)
-
-    def encode(
-        self,
-        inputs: DataLoader[mteb.types.BatchedInput],
-        *args,
-        **kwargs,
-    ) -> np.ndarray:
-        # Hoping to discover potential scheduling
-        # issues by randomizing the order.
-        sentences = [text for batch in inputs for text in batch["text"]]
-        r = self.rng.permutation(len(sentences))
-        sentences = [sentences[i] for i in r]
-
-        embeddings = self.client.embeddings.create(
-            model=self.model_name, input=sentences
-        )
-        outputs = [d.embedding for d in embeddings.data]
-        embeds = np.array(outputs)
-        embeds = embeds[np.argsort(r)]
-        return embeds
-
-
-class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
-    mteb_model_meta = _empty_model_meta

+class VllmMtebCrossEncoder(MtebCrossEncoderMixin):
    def __init__(self, vllm_model):
        self.llm = vllm_model
        self.rng = np.random.default_rng(seed=42)
@ -164,7 +78,7 @@ class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
        return scores


-class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol):
+class ScoreClientMtebEncoder(MtebCrossEncoderMixin):
    mteb_model_meta = _empty_model_meta

    def __init__(self, model_name: str, url):
@ -216,102 +130,6 @@ class RerankClientMtebEncoder(ScoreClientMtebEncoder):
        return response["results"][0]["relevance_score"]


-def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
-    tasks = mteb.get_tasks(tasks=tasks)
-    results = mteb.evaluate(
-        encoder,
-        tasks,
-        cache=None,
-        show_progress_bar=False,
-    )
-
-    main_score = results[0].scores["test"][0]["main_score"]
-    return main_score
-
-
-def mteb_test_embed_models(
-    hf_runner,
-    vllm_runner,
-    model_info: EmbedModelInfo,
-    vllm_extra_kwargs=None,
-    hf_model_callback=None,
-    atol=MTEB_EMBED_TOL,
-):
-    vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
-
-    # Test embed_dims, isnan and whether to use normalize
-    example_prompts = ["The chef prepared a delicious meal." * 1000]
-
-    with vllm_runner(
-        model_info.name,
-        runner="pooling",
-        max_model_len=model_info.max_model_len,
-        **vllm_extra_kwargs,
-    ) as vllm_model:
-        model_config = vllm_model.llm.llm_engine.model_config
-
-        # Confirm whether vllm is using the correct architecture
-        if model_info.architecture:
-            assert model_info.architecture in model_config.architectures
-
-        # Confirm whether vllm uses the correct default_pooling_type, which
-        # relates to whether chunked prefill and prefix caching are enabled
-        assert (
-            model_config._model_info.default_pooling_type
-            == model_info.default_pooling_type
-        )
-
-        vllm_main_score = run_mteb_embed_task(
-            VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS
-        )
-        vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
-        head_dtype = model_config.head_dtype
-
-        # Test embedding_size, isnan and whether to use normalize
-        vllm_outputs = vllm_model.embed(example_prompts, truncate_prompt_tokens=-1)
-        outputs_tensor = torch.tensor(vllm_outputs)
-        assert not torch.any(torch.isnan(outputs_tensor))
-        embedding_size = model_config.embedding_size
-        assert torch.tensor(vllm_outputs).shape[-1] == embedding_size
-
-    # Accelerate mteb test by setting
-    # SentenceTransformers mteb score to a constant
-    if model_info.mteb_score is None:
-        with hf_runner(
-            model_info.name,
-            is_sentence_transformer=True,
-            dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
-        ) as hf_model:
-            # e.g. setting default parameters for the encode method of hf_runner
-            if hf_model_callback is not None:
-                hf_model_callback(hf_model)
-
-            st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
-            st_dtype = next(hf_model.model.parameters()).dtype
-
-            # Check embeddings close to hf outputs
-            hf_outputs = hf_model.encode(example_prompts)
-            check_embeddings_close(
-                embeddings_0_lst=hf_outputs,
-                embeddings_1_lst=vllm_outputs,
-                name_0="hf",
-                name_1="vllm",
-                tol=1e-2,
-            )
-    else:
-        st_main_score = model_info.mteb_score
-        st_dtype = "Constant"
-
-    print("Model:", model_info.name)
-    print("VLLM:", f"dtype:{vllm_dtype}", f"head_dtype:{head_dtype}", vllm_main_score)
-    print("SentenceTransformers:", st_dtype, st_main_score)
-    print("Difference:", st_main_score - vllm_main_score)
-
-    # We are not concerned that the vllm mteb results are better
-    # than SentenceTransformers, so we only perform one-sided testing.
-    assert st_main_score - vllm_main_score < atol
-
-
 def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
    with tempfile.TemporaryDirectory() as prediction_folder:
        bm25s = mteb.get_model("bm25s")
@ -391,18 +209,28 @@ def mteb_test_rerank_models(
        # Score API is only enabled for num_labels == 1
        assert model_config.hf_config.num_labels == 1

-        # Confirm whether vllm uses the correct default_pooling_type, which
-        # relates to whether chunked prefill and prefix caching are enabled
-        assert (
-            model_config._model_info.default_pooling_type
-            == model_info.default_pooling_type
-        )
-
+        # Maybe load chat_template.
        chat_template: str | None = None
        if model_info.chat_template_name is not None:
            chat_template = (template_home / model_info.chat_template_name).read_text()
        vllm_model.chat_template = chat_template

+        # Confirm whether the important configs in model_config are correct.
+        if model_info.pooling_type is not None:
+            assert model_config.pooler_config.pooling_type == model_info.pooling_type
+        if model_info.attn_type is not None:
+            assert model_config.attn_type == model_info.attn_type
+        if model_info.is_prefix_caching_supported is not None:
+            assert (
+                model_config.is_prefix_caching_supported
+                == model_info.is_prefix_caching_supported
+            )
+        if model_info.is_chunked_prefill_supported is not None:
+            assert (
+                model_config.is_chunked_prefill_supported
+                == model_info.is_chunked_prefill_supported
+            )
+
        vllm_main_score = run_mteb_rerank(
            vllm_mteb_encoder(vllm_model),
            tasks=MTEB_RERANK_TASKS,
--- a/tests/models/language/pooling_mteb_test/test_baai.py
+++ b/tests/models/language/pooling_mteb_test/test_baai.py
@ -4,90 +4,94 @@ import pytest

 from tests.models.language.pooling.embed_utils import correctness_test_embed_models
 from tests.models.utils import (
-    CLSPoolingEmbedModelInfo,
-    CLSPoolingRerankModelInfo,
    EmbedModelInfo,
-    LASTPoolingEmbedModelInfo,
    RerankModelInfo,
 )

-from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
+from .mteb_embed_utils import mteb_test_embed_models
+from .mteb_score_utils import mteb_test_rerank_models

 MODELS = [
    ########## BertModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "BAAI/bge-base-en",
        architecture="BertModel",
        mteb_score=0.779336792,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingEmbedModelInfo(
-        "BAAI/bge-base-zh", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "BAAI/bge-small-en", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "BAAI/bge-small-zh", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "BAAI/bge-large-en", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "BAAI/bge-large-zh", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo("BAAI/bge-base-zh", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("BAAI/bge-small-en", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("BAAI/bge-small-zh", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("BAAI/bge-large-en", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("BAAI/bge-large-zh", architecture="BertModel", enable_test=False),
+    EmbedModelInfo(
        "BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False
    ),
    ########## XLMRobertaModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "BAAI/bge-m3",
        architecture="XLMRobertaModel",
        mteb_score=0.787343078,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
    ########## Qwen2Model
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "BAAI/bge-code-v1",
        architecture="Qwen2Model",
        mteb_score=0.75724465,
        dtype="float32",
+        pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
        enable_test=True,
    ),
 ]

 RERANK_MODELS = [
    ########## XLMRobertaForSequenceClassification
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
        "BAAI/bge-reranker-base",
        architecture="XLMRobertaForSequenceClassification",
        mteb_score=0.32398,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
        "BAAI/bge-reranker-large",
        architecture="XLMRobertaForSequenceClassification",
        enable_test=False,
    ),
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
        "BAAI/bge-reranker-v2-m3",
        architecture="XLMRobertaForSequenceClassification",
        enable_test=False,
--- a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
+++ b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py
@ -9,14 +9,12 @@ import torch
 from torch.utils.data import DataLoader

 from tests.conftest import HfRunner
-from tests.models.language.pooling_mteb_test.mteb_utils import (
-    VllmMtebCrossEncoder,
-    mteb_test_rerank_models,
-)
-from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
+from tests.models.utils import RerankModelInfo
+
+from .mteb_score_utils import VllmMtebCrossEncoder, mteb_test_rerank_models

 RERANK_MODELS = [
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
        "BAAI/bge-reranker-v2-gemma",
        architecture="GemmaForSequenceClassification",
        mteb_score=0.33757,
@ -25,6 +23,10 @@ RERANK_MODELS = [
            "classifier_from_token": ["Yes"],
            "method": "no_post_processing",
        },
+        pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
    ),
 ]

--- a/tests/models/language/pooling_mteb_test/test_cross_encoder.py
+++ b/tests/models/language/pooling_mteb_test/test_cross_encoder.py
@ -3,23 +3,29 @@
 import pytest

 from tests.models.utils import (
-    CLSPoolingRerankModelInfo,
-    LASTPoolingRerankModelInfo,
    RerankModelInfo,
 )

-from .mteb_utils import mteb_test_rerank_models
+from .mteb_score_utils import mteb_test_rerank_models

 RERANK_MODELS = [
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
        "cross-encoder/ms-marco-TinyBERT-L-2-v2",
        mteb_score=0.32898,
        architecture="BertForSequenceClassification",
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
    ),
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
        "tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
        mteb_score=0.25736,
        architecture="Qwen3ForSequenceClassification",
+        pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
    ),
 ]

--- a/tests/models/language/pooling_mteb_test/test_gte.py
+++ b/tests/models/language/pooling_mteb_test/test_gte.py
@ -5,36 +5,32 @@ import pytest

 from tests.models.language.pooling.embed_utils import correctness_test_embed_models
 from tests.models.utils import (
-    CLSPoolingEmbedModelInfo,
-    CLSPoolingRerankModelInfo,
    EmbedModelInfo,
-    LASTPoolingEmbedModelInfo,
    RerankModelInfo,
 )

-from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
+from .mteb_embed_utils import mteb_test_embed_models
+from .mteb_score_utils import mteb_test_rerank_models

 MODELS = [
    ########## BertModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "thenlper/gte-large",
        mteb_score=0.76807651,
        architecture="BertModel",
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingEmbedModelInfo(
-        "thenlper/gte-base", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "thenlper/gte-small", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo("thenlper/gte-base", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("thenlper/gte-small", architecture="BertModel", enable_test=False),
+    EmbedModelInfo(
        "thenlper/gte-large-zh", architecture="BertModel", enable_test=False
    ),
-    CLSPoolingEmbedModelInfo(
-        "thenlper/gte-base-zh", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo("thenlper/gte-base-zh", architecture="BertModel", enable_test=False),
+    EmbedModelInfo(
        "thenlper/gte-small-zh", architecture="BertModel", enable_test=False
    ),
    ########### NewModel
@ -43,48 +39,64 @@ MODELS = [
    # - whether to use token_type_embeddings
    # - whether to use context expansion
    # So only test one (the most widely used) model
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Alibaba-NLP/gte-multilingual-base",
        architecture="GteNewModel",
        mteb_score=0.775074696,
        hf_overrides={"architectures": ["GteNewModel"]},
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Alibaba-NLP/gte-base-en-v1.5",
        architecture="GteNewModel",
        hf_overrides={"architectures": ["GteNewModel"]},
        enable_test=False,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Alibaba-NLP/gte-large-en-v1.5",
        architecture="GteNewModel",
        hf_overrides={"architectures": ["GteNewModel"]},
        enable_test=False,
    ),
    ########### Qwen2ForCausalLM
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
        mteb_score=0.758473459018872,
        architecture="Qwen2ForCausalLM",
+        pooling_type="LAST",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
    ########## ModernBertModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Alibaba-NLP/gte-modernbert-base",
        mteb_score=0.748193353,
        architecture="ModernBertModel",
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
    ########## Qwen3ForCausalLM
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Qwen/Qwen3-Embedding-0.6B",
        mteb_score=0.771163695,
        architecture="Qwen3ForCausalLM",
        dtype="float32",
+        pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
        enable_test=True,
    ),
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Qwen/Qwen3-Embedding-4B",
        architecture="Qwen3ForCausalLM",
        dtype="float32",
@ -93,18 +105,26 @@ MODELS = [
 ]

 RERANK_MODELS = [
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
        # classifier_pooling: mean
        "Alibaba-NLP/gte-reranker-modernbert-base",
        mteb_score=0.33386,
        architecture="ModernBertForSequenceClassification",
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
        "Alibaba-NLP/gte-multilingual-reranker-base",
        mteb_score=0.33062,
        architecture="GteNewForSequenceClassification",
        hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
 ]
--- a/tests/models/language/pooling_mteb_test/test_intfloat.py
+++ b/tests/models/language/pooling_mteb_test/test_intfloat.py
@ -3,40 +3,44 @@
 import pytest

 from tests.models.language.pooling.embed_utils import correctness_test_embed_models
-from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
+from tests.models.utils import EmbedModelInfo

-from .mteb_utils import mteb_test_embed_models
+from .mteb_embed_utils import mteb_test_embed_models

 MODELS = [
    ########## BertModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "intfloat/e5-small",
        architecture="BertModel",
        mteb_score=0.742285423,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingEmbedModelInfo(
-        "intfloat/e5-base", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
-        "intfloat/e5-large", architecture="BertModel", enable_test=False
-    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo("intfloat/e5-base", architecture="BertModel", enable_test=False),
+    EmbedModelInfo("intfloat/e5-large", architecture="BertModel", enable_test=False),
+    EmbedModelInfo(
        "intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False
    ),
    ########## XLMRobertaModel
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "intfloat/multilingual-e5-base",
        architecture="XLMRobertaModel",
        mteb_score=0.779325955,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "intfloat/multilingual-e5-large",
        architecture="XLMRobertaModel",
        enable_test=False,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "intfloat/multilingual-e5-large-instruct",
        architecture="XLMRobertaModel",
        enable_test=False,
--- a/tests/models/language/pooling_mteb_test/test_jina.py
+++ b/tests/models/language/pooling_mteb_test/test_jina.py
@ -10,30 +10,37 @@ from tests.models.language.pooling.embed_utils import (
    matryoshka_fy,
 )
 from tests.models.utils import (
-    CLSPoolingEmbedModelInfo,
-    CLSPoolingRerankModelInfo,
    EmbedModelInfo,
    RerankModelInfo,
 )
 from vllm import PoolingParams

-from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
+from .mteb_embed_utils import mteb_test_embed_models
+from .mteb_score_utils import mteb_test_rerank_models

 EMBEDDING_MODELS = [
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "jinaai/jina-embeddings-v3",
        mteb_score=0.824413164,
        architecture="XLMRobertaModel",
        is_matryoshka=True,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        dtype="float32",
    )
 ]

 RERANK_MODELS = [
-    CLSPoolingRerankModelInfo(
+    RerankModelInfo(
        "jinaai/jina-reranker-v2-base-multilingual",
        mteb_score=0.33643,
        architecture="XLMRobertaForSequenceClassification",
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
    )
 ]

--- a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
+++ b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py
@ -6,9 +6,9 @@ import pytest
 import torch

 from tests.conftest import HfRunner
-from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
+from tests.models.utils import RerankModelInfo

-from .mteb_utils import mteb_test_rerank_models
+from .mteb_score_utils import mteb_test_rerank_models

 mxbai_rerank_hf_overrides = {
    "architectures": ["Qwen2ForSequenceClassification"],
@ -17,14 +17,18 @@ mxbai_rerank_hf_overrides = {
 }

 RERANK_MODELS = [
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
        "mixedbread-ai/mxbai-rerank-base-v2",
        architecture="Qwen2ForSequenceClassification",
        hf_overrides=mxbai_rerank_hf_overrides,
        mteb_score=0.273,
+        pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
        enable_test=True,
    ),
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
        "mixedbread-ai/mxbai-rerank-large-v2",
        architecture="Qwen2ForSequenceClassification",
        hf_overrides=mxbai_rerank_hf_overrides,
--- a/tests/models/language/pooling_mteb_test/test_nemotron.py
+++ b/tests/models/language/pooling_mteb_test/test_nemotron.py
@ -3,29 +3,39 @@

 import pytest

+from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
+    mteb_test_embed_models,
+)
+from tests.models.language.pooling_mteb_test.mteb_score_utils import (
+    mteb_test_rerank_models,
+)
 from tests.models.utils import (
    EmbedModelInfo,
-    LASTPoolingEmbedModelInfo,
-    LASTPoolingRerankModelInfo,
    RerankModelInfo,
 )

-from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
-
 EMBEDDING_MODELS = [
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "nvidia/llama-nemotron-embed-1b-v2",
        architecture="LlamaBidirectionalModel",
        mteb_score=0.689164662128673,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
    )
 ]

 RERANK_MODELS = [
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
        "nvidia/llama-nemotron-rerank-1b-v2",
        architecture="LlamaBidirectionalForSequenceClassification",
        chat_template_name="nemotron-rerank.jinja",
        mteb_score=0.33994,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
    ),
 ]

--- a/tests/models/language/pooling_mteb_test/test_nomic.py
+++ b/tests/models/language/pooling_mteb_test/test_nomic.py
@ -4,30 +4,38 @@
 import pytest

 from tests.models.language.pooling.embed_utils import correctness_test_embed_models
-from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
+from tests.models.utils import EmbedModelInfo

-from .mteb_utils import mteb_test_embed_models
+from .mteb_embed_utils import mteb_test_embed_models

 MODELS = [
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "nomic-ai/nomic-embed-text-v1",
        architecture="NomicBertModel",
        mteb_score=0.737568559,
        enable_test=True,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "nomic-ai/nomic-embed-text-v1.5",
        architecture="NomicBertModel",
        enable_test=False,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "nomic-ai/nomic-embed-text-v2-moe",
        architecture="NomicBertModel",
        mteb_score=0.715488912,
        enable_test=True,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
    ),
 ]

--- a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
+++ b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py
@ -6,10 +6,10 @@ import pytest
 import torch

 from tests.conftest import HfRunner
-from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
+from tests.models.utils import RerankModelInfo
 from tests.utils import multi_gpu_test

-from .mteb_utils import mteb_test_rerank_models
+from .mteb_score_utils import mteb_test_rerank_models

 qwen3_reranker_hf_overrides = {
    "architectures": ["Qwen3ForSequenceClassification"],
@ -18,14 +18,18 @@ qwen3_reranker_hf_overrides = {
 }

 RERANK_MODELS = [
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
        "Qwen/Qwen3-Reranker-0.6B",
        architecture="Qwen3ForSequenceClassification",
        mteb_score=0.25736,
        hf_overrides=qwen3_reranker_hf_overrides,
+        pooling_type="LAST",
+        attn_type="decoder",
+        is_prefix_caching_supported=True,
+        is_chunked_prefill_supported=True,
        enable_test=True,
    ),
-    LASTPoolingRerankModelInfo(
+    RerankModelInfo(
        "Qwen/Qwen3-Reranker-4B",
        architecture="Qwen3ForSequenceClassification",
        hf_overrides=qwen3_reranker_hf_overrides,
--- a/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py
+++ b/tests/models/language/pooling_mteb_test/test_snowflake_arctic_embed.py
@ -4,62 +4,82 @@
 import pytest

 from tests.models.language.pooling.embed_utils import correctness_test_embed_models
-from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
+from tests.models.utils import EmbedModelInfo

-from .mteb_utils import mteb_test_embed_models
+from .mteb_embed_utils import mteb_test_embed_models

 MODELS = [
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Snowflake/snowflake-arctic-embed-xs",
        is_matryoshka=False,
        architecture="BertModel",
        mteb_score=0.714927797,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Snowflake/snowflake-arctic-embed-s",
        is_matryoshka=False,
        architecture="BertModel",
        enable_test=False,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Snowflake/snowflake-arctic-embed-m",
        is_matryoshka=False,
        architecture="BertModel",
        enable_test=False,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Snowflake/snowflake-arctic-embed-m-long",
        is_matryoshka=False,
        architecture="NomicBertModel",
        mteb_score=0.681146831,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Snowflake/snowflake-arctic-embed-l",
        is_matryoshka=False,
        architecture="BertModel",
        enable_test=False,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Snowflake/snowflake-arctic-embed-m-v1.5",
        is_matryoshka=True,
        architecture="BertModel",
        mteb_score=0.649088363,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Snowflake/snowflake-arctic-embed-l-v2.0",
        is_matryoshka=True,
        architecture="XLMRobertaModel",
        mteb_score=0.712258299,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "Snowflake/snowflake-arctic-embed-m-v2.0",
        is_matryoshka=True,
        architecture="GteModel",
        mteb_score=0.706622444,
+        pooling_type="CLS",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
 ]
--- a/tests/models/language/pooling_mteb_test/test_st_projector.py
+++ b/tests/models/language/pooling_mteb_test/test_st_projector.py
@ -3,25 +3,31 @@
 import pytest

 from tests.models.utils import (
-    CLSPoolingEmbedModelInfo,
    EmbedModelInfo,
-    LASTPoolingEmbedModelInfo,
 )

-from .mteb_utils import mteb_test_embed_models
+from .mteb_embed_utils import mteb_test_embed_models

 # ST models with projector (Dense) layers
 ST_PROJECTOR_MODELS = [
-    CLSPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "TencentBAC/Conan-embedding-v1",
        architecture="BertModel",
        mteb_score=0.688611955,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
    ),
-    LASTPoolingEmbedModelInfo(
+    EmbedModelInfo(
        "google/embeddinggemma-300m",
        architecture="Gemma3TextModel",
        mteb_score=0.7473819294684156,
+        pooling_type="MEAN",
+        attn_type="encoder_only",
+        is_prefix_caching_supported=False,
+        is_chunked_prefill_supported=False,
        enable_test=True,
        dtype="float32",
    ),
--- a/tests/models/multimodal/conftest.py
+++ b/tests/models/multimodal/conftest.py
@ -19,7 +19,7 @@ def pytest_collection_modifyitems(config, items):
        return

    # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
-    # accuracy issues
+    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
    # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
    torch.backends.cuda.enable_flash_sdp(False)
    torch.backends.cuda.enable_mem_efficient_sdp(False)
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@ -513,6 +513,7 @@ VLM_TEST_SETTINGS = {
        max_model_len=8192,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
+        num_logprobs=10 if current_platform.is_rocm() else 5,
    ),
    "intern_vl-hf": VLMTestInfo(
        models=["OpenGVLab/InternVL3-1B-hf"],
--- a/tests/models/multimodal/generation/test_keye.py
+++ b/tests/models/multimodal/generation/test_keye.py
@ -8,7 +8,7 @@ from PIL.Image import Image
 from transformers import AutoProcessor

 from vllm import LLM, EngineArgs, SamplingParams
-from vllm.multimodal.utils import encode_image_base64
+from vllm.multimodal.utils import encode_image_url

 MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview"

@ -31,10 +31,7 @@ def test_keye_vl(
    question: str,
 ):
    images = [asset.pil_image for asset in image_assets]
-
-    image_urls = [
-        f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images
-    ]
+    image_urls = [encode_image_url(image) for image in images]

    engine_args = EngineArgs(
        model=MODEL_NAME,
--- a/tests/models/multimodal/generation/test_qwen2_vl.py
+++ b/tests/models/multimodal/generation/test_qwen2_vl.py
@ -267,7 +267,7 @@ def run_embedding_input_test(
    """Inference result should be the same between
    original image/video input and image/video embeddings input.
    """
-    from transformers import AutoProcessor  # noqa: F401
+    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained(model)

--- a/tests/models/multimodal/generation/test_vit_backend_functionality.py
+++ b/tests/models/multimodal/generation/test_vit_backend_functionality.py
@ -15,7 +15,7 @@ from transformers import AutoProcessor

 from vllm import LLM, EngineArgs, SamplingParams
 from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.multimodal.utils import encode_image_base64
+from vllm.multimodal.utils import encode_image_url
 from vllm.multimodal.video import sample_frames_from_video
 from vllm.platforms import current_platform

@ -178,8 +178,7 @@ def build_dots_ocr_prompt(images, config):
    """Build Dots.OCR specific prompt with OCR instructions."""
    # Use only stop_sign image for Dots.OCR
    image = images[0]  # Already filtered to stop_sign
-
-    image_url = f"data:image/jpeg;base64,{encode_image_base64(image)}"
+    image_url = encode_image_url(image)

    placeholders = [{"type": "image_url", "image_url": {"url": image_url}}]
    messages = [
@ -204,9 +203,7 @@ def build_processor_prompt(images, config):
        config["model_name"], trust_remote_code=True
    )

-    image_urls = [
-        f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
-    ]
+    image_urls = [encode_image_url(img) for img in images]
    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
@ -225,9 +222,7 @@ def build_processor_prompt(images, config):

 def build_ovis_prompt(images, config):
    """Build Ovis2.5 specific prompt with custom format."""
-    image_urls = [
-        f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
-    ]
+    image_urls = [encode_image_url(img) for img in images]

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
--- a/tests/models/multimodal/generation/test_voxtral.py
+++ b/tests/models/multimodal/generation/test_voxtral.py
@ -111,4 +111,5 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):

    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
+    assert choice.message.content == "In the first audio clip, you hear a brief"
    assert choice.finish_reason == "length"
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@ -865,6 +865,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        # disable this temporarily until we support HF format
        is_available_online=False,
    ),
+    "VoxtralStreamingGeneration": _HfExamplesInfo(
+        "<place-holder>",
+        # disable this temporarily until we support HF format
+        is_available_online=False,
+    ),
    # [Encoder-decoder]
    "WhisperForConditionalGeneration": _HfExamplesInfo(
        "openai/whisper-large-v3-turbo",
--- a/tests/models/test_terratorch.py
+++ b/tests/models/test_terratorch.py
@ -38,7 +38,7 @@ def test_inference(
        max_num_seqs=32,
        default_torch_num_threads=1,
    ) as vllm_model:
-        vllm_output = vllm_model.llm.encode(prompt)
+        vllm_output = vllm_model.llm.encode(prompt, pooling_task="plugin")
        assert torch.equal(
            torch.isnan(vllm_output[0].outputs.data).any(), torch.tensor(False)
        )
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@ -10,7 +10,7 @@ import torch
 import torch.nn.functional as F
 from transformers import PretrainedConfig

-from vllm.config.model import ModelConfig, ModelDType, RunnerOption
+from vllm.config.model import AttnTypeStr, ModelConfig, ModelDType, RunnerOption
 from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
 from vllm.multimodal.processing import InputProcessingContext
 from vllm.tokenizers import cached_tokenizer_from_config
@ -375,7 +375,10 @@ class ModelInfo:
    max_model_len: int | None = None
    hf_dtype: str = "float32"
    hf_overrides: dict[str, Any] | None = None
-    default_pooling_type: str = ""
+    pooling_type: str | None = None
+    attn_type: AttnTypeStr | None = None
+    is_prefix_caching_supported: bool | None = None
+    is_chunked_prefill_supported: bool | None = None
    enable_test: bool = True


@ -386,32 +389,12 @@ class EmbedModelInfo(ModelInfo):
    matryoshka_dimensions: list[int] | None = None


-@dataclass
-class CLSPoolingEmbedModelInfo(EmbedModelInfo):
-    default_pooling_type: str = "CLS"
-
-
-@dataclass
-class LASTPoolingEmbedModelInfo(EmbedModelInfo):
-    default_pooling_type: str = "LAST"
-
-
@dataclass
 class RerankModelInfo(ModelInfo):
    mteb_score: float | None = None
    chat_template_name: str | None = None


-@dataclass
-class CLSPoolingRerankModelInfo(RerankModelInfo):
-    default_pooling_type: str = "CLS"
-
-
-@dataclass
-class LASTPoolingRerankModelInfo(RerankModelInfo):
-    default_pooling_type: str = "LAST"
-
-
@dataclass
 class GenerateModelInfo(ModelInfo):
    hf_dtype: str = "auto"
--- a/tests/standalone_tests/pytorch_nightly_dependency.sh
+++ b/tests/standalone_tests/pytorch_nightly_dependency.sh
@ -4,6 +4,11 @@
 set -e
 set -x

+if command -v rocminfo >/dev/null 2>&1; then
+  echo "Skipping test for ROCm platform"
+  exit 0
+fi
+
 cd /vllm-workspace/

 rm -rf .venv
@ -36,7 +41,7 @@ if diff before.txt after.txt; then
  echo "torch version not overridden."
 else
  echo "torch version overridden by nightly_torch_test.txt, \
-  if the dependency is not triggered by the pytroch nightly test,\
+  if the dependency is not triggered by the pytorch nightly test,\
  please add the dependency to the list 'white_list' in tools/pre_commit/generate_nightly_torch_test.py"
  exit 1
 fi
--- a/tests/tool_parsers/test_mistral_tool_parser.py
+++ b/tests/tool_parsers/test_mistral_tool_parser.py
@ -281,6 +281,8 @@ def test_extract_tool_calls_pre_v11_tokenizer(
        "single_tool_add",
        "single_tool_weather",
        "multiple_tool_calls",
+        "complex",
+        "wrong_json",
    ],
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
@ -326,6 +328,36 @@ def test_extract_tool_calls_pre_v11_tokenizer(
            ],
            None,
        ),
+        (
+            # Complex
+            """hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="bash",
+                        arguments=json.dumps(
+                            {"command": "print(\"hello world!\")\nre.compile(r'{}')"}
+                        )[:-2],
+                    )
+                )
+            ],
+            "hi{hi",
+        ),
+        (
+            # Wrong json
+            """hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""",  # noqa: E501
+            [
+                ToolCall(
+                    function=FunctionCall(
+                        name="bash",
+                        arguments=json.dumps(
+                            {"command": "print(\"hello world!\")\nre.compile(r'{}')"}
+                        ),
+                    )
+                )
+            ],
+            "hi{hi",
+        ),
    ],
 )
 def test_extract_tool_calls(
@ -673,7 +705,7 @@ def test_extract_tool_calls_streaming(
        ),
        (
            # Complex
-            """[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""",  # noqa: E501
+            """hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
@ -684,7 +716,7 @@ def test_extract_tool_calls_streaming(
                    )
                )
            ],
-            "",
+            "hi{hi",
        ),
    ],
 )
--- a/tests/utils.py
+++ b/tests/utils.py
@ -106,6 +106,7 @@ class RemoteOpenAIServer:
            env.update(env_dict)
        serve_cmd = ["vllm", "serve", model, *vllm_serve_args]
        print(f"Launching RemoteOpenAIServer with: {' '.join(serve_cmd)}")
+        print(f"Environment variables: {env}")
        self.proc: subprocess.Popen = subprocess.Popen(
            serve_cmd,
            env=env,
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@ -1798,3 +1798,60 @@ def test_request_with_prompt_embeds_and_mm_inputs(hash_fn: Callable[[Any], bytes
        )
    )
    assert block_hashes[1] == expected_hash2
+
+
+def test_auto_fit_max_model_len():
+    """Test that max_model_len=-1 auto-fits to available GPU memory.
+
+    Two scenarios are checked, each with a fresh config: with ample memory
+    max_model_len must stay at 1024; with limited memory it must end up
+    strictly between 0 and 1024.
+    """
+    # Create config with original_max_model_len=-1 to trigger auto-fit
+    model_config = ModelConfig(max_model_len=1024)
+    # Simulate the user passing -1 by setting original_max_model_len
+    model_config.original_max_model_len = -1
+    vllm_config = VllmConfig(model_config=model_config)
+
+    # 16 * 2 * 64 * 4 * 2 = 16384 bytes.
+    # NOTE(review): the factors presumably mirror the defaults of
+    # new_kv_cache_spec() -- confirm against that helper.
+    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2  # 16KB per block per layer
+    kv_cache_specs = {
+        "layer_1": new_kv_cache_spec(),
+        "layer_2": new_kv_cache_spec(),
+    }
+
+    # With enough memory, max_model_len stays at the derived max
+    large_available_memory = mem_per_block_per_layer * 2 * 1024  # plenty of memory
+    # The return value is unused; the assertion inspects vllm_config, which
+    # the call is expected to update in place.
+    _kv_cache_configs = get_kv_cache_configs(
+        vllm_config, [kv_cache_specs], [large_available_memory]
+    )
+    assert vllm_config.model_config.max_model_len == 1024
+
+    # Build a fresh config for the limited-memory scenario
+    model_config = ModelConfig(max_model_len=1024)
+    model_config.original_max_model_len = -1
+    vllm_config = VllmConfig(model_config=model_config)
+
+    # With limited memory, max_model_len should be reduced
+    # Need memory for at least max_model_len tokens
+    # 32 blocks worth of memory for 2 layers = can fit 32*16=512 tokens
+    limited_memory = mem_per_block_per_layer * 2 * 32
+    _kv_cache_configs = get_kv_cache_configs(
+        vllm_config, [kv_cache_specs], [limited_memory]
+    )
+    # Should be reduced to fit in memory; only the bounds are asserted, not
+    # the exact fitted length.
+    assert vllm_config.model_config.max_model_len < 1024
+    assert vllm_config.model_config.max_model_len > 0
+
+
+def test_auto_fit_max_model_len_not_triggered():
+    """Test that auto-fit is not triggered when original_max_model_len is not -1.
+
+    max_model_len is set to 16 and must still be 16 after
+    get_kv_cache_configs() has run.
+    """
+    model_config = ModelConfig(max_model_len=16)
+    # original_max_model_len should be None by default, not -1
+    vllm_config = VllmConfig(model_config=model_config)
+
+    # 16 * 2 * 64 * 4 * 2 = 16384 bytes per block per layer.
+    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
+    kv_cache_specs = {
+        "layer_1": new_kv_cache_spec(),
+        "layer_2": new_kv_cache_spec(),
+    }
+
+    # This should work normally without auto-fit. The return value is unused;
+    # the assertion checks that the configured length was left untouched.
+    _kv_cache_configs = get_kv_cache_configs(
+        vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32]
+    )
+    assert vllm_config.model_config.max_model_len == 16
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@ -1356,6 +1356,69 @@ def test_kv_cache_events(blocks_to_cache: int):
    assert len(manager.block_pool.cached_block_hash_to_block) == 0


+def test_null_parent_block_hash():
+    """Test that BlockStored reports the logical parent hash for a null parent.
+
+    `pool.null_block` (which has no hash) is placed directly before the
+    blocks being cached, so the parent hash in the emitted event must come
+    from `req.block_hashes` rather than from the physical parent block.
+    """
+    block_size = 1
+    num_cached_blocks = 2
+    num_full_blocks = 4
+
+    pool = BlockPool(
+        num_gpu_blocks=8,
+        enable_caching=True,
+        hash_block_size=block_size,
+        enable_kv_cache_events=True,
+    )
+
+    req = make_request(
+        "req_null_parent",
+        prompt_token_ids=[10, 11, 12, 13],
+        block_size=block_size,
+        hash_fn=sha256,
+    )
+    assert len(req.block_hashes) == num_full_blocks
+
+    # Physical parent is `null_block` (no hash), while the logical parent hash
+    # still exists in `request.block_hashes[num_cached_blocks - 1]`.
+    assert pool.null_block.block_hash is None
+    new_blocks = pool.get_new_blocks(num_full_blocks - 1)
+    blocks = [
+        # Unpack the leading slice so `blocks` is a flat list of blocks rather
+        # than holding a nested list at index 0.
+        *new_blocks[: num_cached_blocks - 1],
+        pool.null_block,  # physical parent
+        *new_blocks[num_cached_blocks - 1 :],
+    ]
+
+    pool.cache_full_blocks(
+        request=req,
+        blocks=blocks,
+        num_cached_blocks=num_cached_blocks,
+        num_full_blocks=num_full_blocks,
+        block_size=block_size,
+        kv_cache_group_id=0,
+    )
+
+    # Caching the two new full blocks is expected to emit exactly one event.
+    events = pool.take_events()
+    assert len(events) == 1
+    event = events[0]
+    assert isinstance(event, BlockStored)
+
+    expected_parent = kv_cache_utils.maybe_convert_block_hash(
+        req.block_hashes[num_cached_blocks - 1]
+    )
+    assert event.parent_block_hash == expected_parent
+    assert event.parent_block_hash is not None
+
+    expected_new_hashes = [
+        kv_cache_utils.maybe_convert_block_hash(h)
+        for h in req.block_hashes[num_cached_blocks:num_full_blocks]
+    ]
+    assert event.block_hashes == expected_new_hashes
+
+    # Ensure we didn't accidentally assign a hash to the null block.
+    assert pool.null_block.block_hash is None
+    # Sanity check: newly cached physical blocks should have hashes assigned.
+    assert blocks[num_cached_blocks].block_hash is not None
+    assert blocks[num_full_blocks - 1].block_hash is not None
+
+
@pytest.mark.parametrize("blocks_to_cache", [2, 3, 10])
 def test_kv_cache_events_with_lora(blocks_to_cache: int):
    """Test BlockStored events contain correct lora_id when using LoRA requests."""
--- a/tests/v1/ec_connector/integration/test_epd_correctness.py
+++ b/tests/v1/ec_connector/integration/test_epd_correctness.py
@ -31,7 +31,7 @@ import openai
 import requests

 from vllm.assets.image import ImageAsset
-from vllm.multimodal.utils import encode_image_base64
+from vllm.multimodal.utils import encode_image_url

 MAX_OUTPUT_LEN = 256

@ -49,9 +49,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
                "content": [
                    {
                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image;base64,{encode_image_base64(image_1)}"
-                        },
+                        "image_url": {"url": encode_image_url(image_1)},
                    },
                    {"type": "text", "text": "What's in this image?"},
                ],
@ -66,9 +64,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
                "content": [
                    {
                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image;base64,{encode_image_base64(image_2)}"
-                        },
+                        "image_url": {"url": encode_image_url(image_2)},
                    },
                    {
                        "type": "image_url",
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@ -260,7 +260,7 @@ async def test_multi_abort(output_kind: RequestOutputKind):

        # Use multi-abort to abort multiple requests at once
        abort_request_ids = [request_ids[i] for i in REQUEST_IDS_TO_ABORT]
-        await engine.abort(abort_request_ids)
+        await engine.abort(abort_request_ids, internal=False)

        # Wait for all tasks to complete
        results = await asyncio.gather(*tasks, return_exceptions=True)
@ -609,7 +609,7 @@ async def test_abort_final_output(output_kind: RequestOutputKind):
        await asyncio.sleep(0.5)

        # Abort the request
-        await engine.abort(request_id)
+        await engine.abort(request_id, internal=False)

        # Wait for generation to complete and return final output
        final_output = await generated
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@ -40,10 +40,16 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
 PROMPT = "I am Gyoubu Masataka Oniwa"
 PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids

+_REQUEST_COUNTER = 0
+

 def make_request() -> EngineCoreRequest:
+    global _REQUEST_COUNTER
+    _REQUEST_COUNTER += 1
+    request_id = f"request-{_REQUEST_COUNTER}"
    return EngineCoreRequest(
-        request_id=str(uuid.uuid4()),
+        request_id=request_id,
+        external_req_id=f"{request_id}-{uuid.uuid4()}",
        prompt_token_ids=PROMPT_TOKENS,
        mm_features=None,
        sampling_params=SamplingParams(),
--- a/Show More
+++ b/Show More