Merge branch 'main' into upstream_mori_

2026-07-09 22:37:41 +08:00 · 2025-12-24 17:44:07 +08:00 · 2025-12-24 17:44:07 +08:00 · 8b5e2e69fb
commit 8b5e2e69fb
parent 03343276fa b41aeb3468
115 changed files with 2642 additions and 1446 deletions
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@ -162,7 +162,10 @@ steps:
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/test_vision_embeds.py
  # Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
  # TODO: Remove after next torch update
  - VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s entrypoints/openai/test_vision_embeds.py
  - pytest -v -s entrypoints/test_chat_utils.py
 - label: Entrypoints Integration Test (API Server 2)
@ -219,6 +222,9 @@ steps:
  - tests/v1/engine/test_engine_core_client.py
  - tests/distributed/test_symm_mem_allreduce.py
  commands:
  # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
  # TODO: Remove when the bug is fixed in a future ROCm release
  - export TORCH_NCCL_BLOCKING_WAIT=1
  # test with torchrun tp=2 and external_dp=2
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=2 and pp=2
@ -267,9 +273,10 @@ steps:
  - vllm/v1/executor/uniproc_executor.py
  - vllm/v1/worker/gpu_worker.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  #- export NCCL_CUMEM_HOST_ENABLE=0
  # test with torchrun tp=2 and dp=4 with ep
  # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
  # TODO: Remove when the bug is fixed in a future ROCm release
  - export TORCH_NCCL_BLOCKING_WAIT=1
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
 - label: EPLB Algorithm Test # 5min
@ -979,7 +986,10 @@ steps:
    - export MIOPEN_DEBUG_CONV_GEMM=0
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pip freeze | grep -E 'torch'
-    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing --ignore models/multimodal/pooling/test_prithvi_mae.py
    # Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
    # TODO: Remove after next torch update
    - VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s models/multimodal/pooling/test_prithvi_mae.py -m core_model
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
 - label: Multi-Modal Accuracy Eval (Small Models) # 5min
@ -1288,6 +1298,9 @@ steps:
  - tests/v1/shutdown
  - tests/v1/worker/test_worker_memory_snapshot.py
  commands:
  # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
  # TODO: Remove when the bug is fixed in a future ROCm release
  - export TORCH_NCCL_BLOCKING_WAIT=1
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
@ -1341,7 +1354,9 @@ steps:
  # end platform plugin tests
  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
  - pip install -e ./plugins/prithvi_io_processor_plugin
-  - pytest -v -s plugins_tests/test_io_processor_plugins.py
+  # Need tf32 to avoid conflicting precision issue with terratorch on ROCm.
  # TODO: Remove after next torch update
  - VLLM_FLOAT32_MATMUL_PRECISION="tf32" pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
  # begin stat_logger plugins test
@ -1510,7 +1525,7 @@ steps:
    - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
    - pytest -v -s tests/distributed/test_context_parallel.py
-    - HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
+    - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
    - pytest -v -s tests/v1/distributed/test_dbo.py
 ##### B200 test #####
--- a/csrc/cache.h
+++ b/csrc/cache.h
@ -9,16 +9,6 @@
 void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
                 const torch::Tensor& block_mapping);
 // Note: the key_caches and value_caches vectors are constant but
 // not the Tensors they contain. The vectors need to be const refs
 // in order to satisfy pytorch's C++ operator registration code.
 void copy_blocks(std::vector<torch::Tensor> const& key_caches,
                 std::vector<torch::Tensor> const& value_caches,
                 const torch::Tensor& block_mapping);
 void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
                     const torch::Tensor& block_mapping);
 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                       torch::Tensor& key_cache, torch::Tensor& value_cache,
                       torch::Tensor& slot_mapping,
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@ -119,94 +119,6 @@ __global__ void copy_blocks_mla_kernel(
 }  // namespace vllm
 // Host-side launcher for vllm::copy_blocks_kernel: copies cache blocks in
 // every layer's key cache and value cache as described by block_mapping.
 //
 // Note: the key_caches and value_caches vectors are constant but
 // not the Tensors they contain. The vectors need to be const refs
 // in order to satisfy pytorch's C++ operator registration code.
 //
 // key_caches / value_caches: one tensor per layer. Both vectors must have the
 //   same length; only the first key cache's device is checked to be CUDA.
 // block_mapping: 2D tensor with shape (num_pairs, 2), read as int64.
 //   NOTE(review): presumably (source block, destination block) pairs that
 //   already live on the cache device -- confirm against the kernel.
 void copy_blocks(std::vector<torch::Tensor> const& key_caches,
                 std::vector<torch::Tensor> const& value_caches,
                 const torch::Tensor& block_mapping) {
  // Compare the container sizes directly so the check is not a
  // signed/unsigned comparison.
  TORCH_CHECK(key_caches.size() == value_caches.size());
  const int num_layers = static_cast<int>(key_caches.size());
  if (num_layers == 0) {
    return;
  }
  torch::Device cache_device = key_caches[0].device();
  TORCH_CHECK(cache_device.is_cuda());
  // Create data structures for the kernel.
  // Create arrays of pointers to the key and value caches. std::vector is
  // used instead of a variable-length array, which is a compiler extension
  // rather than standard C++ (copy_blocks_mla does the same).
  std::vector<int64_t> key_cache_ptrs(num_layers);
  std::vector<int64_t> value_cache_ptrs(num_layers);
  for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
    key_cache_ptrs[layer_idx] =
        reinterpret_cast<int64_t>(key_caches[layer_idx].data_ptr());
    value_cache_ptrs[layer_idx] =
        reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
  }
  // block_mapping is a 2D tensor with shape (num_pairs, 2).
  int num_pairs = block_mapping.size(0);
  // Move the data structures to the GPU.
  // NOTE: This synchronizes the CPU and GPU.
  torch::Tensor key_cache_ptrs_tensor =
      torch::from_blob(key_cache_ptrs.data(), {num_layers}, torch::kInt64)
          .to(cache_device);
  torch::Tensor value_cache_ptrs_tensor =
      torch::from_blob(value_cache_ptrs.data(), {num_layers}, torch::kInt64)
          .to(cache_device);
  // Launch the kernel: one grid column per layer, one grid row per mapping
  // pair, and at most 1024 threads per block.
  const int numel_per_block = key_caches[0][0].numel();
  dim3 grid(num_layers, num_pairs);
  dim3 block(std::min(1024, numel_per_block));
  const at::cuda::OptionalCUDAGuard device_guard(cache_device);
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
      key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] {
        vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
            key_cache_ptrs_tensor.data_ptr<int64_t>(),
            value_cache_ptrs_tensor.data_ptr<int64_t>(),
            block_mapping.data_ptr<int64_t>(), numel_per_block);
      }));
 }
 // Host-side launcher for vllm::copy_blocks_mla_kernel (MLA assumes a joint
 // KV-cache, so there is a single cache tensor per layer).
 //
 // kv_caches: one cache tensor per layer; the first one must be on CUDA.
 // block_mapping: tensor whose first dimension is the number of block pairs,
 //   read as int64.
 void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches,
                     const torch::Tensor& block_mapping) {
  const int layer_count = kv_caches.size();
  if (layer_count == 0) {
    return;
  }
  torch::Device device = kv_caches[0].device();
  TORCH_CHECK(device.is_cuda(), "kv_cache must be on CUDA");
  // Collect every layer's base address on the host, then ship the table to
  // the cache device so the kernel can index it by layer.
  std::vector<int64_t> layer_ptrs;
  layer_ptrs.reserve(layer_count);
  for (const torch::Tensor& layer_cache : kv_caches) {
    layer_ptrs.push_back(reinterpret_cast<int64_t>(layer_cache.data_ptr()));
  }
  torch::Tensor layer_ptrs_dev =
      torch::from_blob(layer_ptrs.data(), {layer_count}, torch::kInt64)
          .to(device);
  const int pair_count = block_mapping.size(0);
  // The per-block extent is taken from the stride rather than numel: the cache
  // may be padded for memory alignment, and each block's data (padding
  // included) is assumed to be contiguous in memory.
  const int block_stride = kv_caches[0].stride(0);
  dim3 launch_grid(layer_count, pair_count);
  dim3 launch_block(std::min(1024, block_stride));
  const at::cuda::OptionalCUDAGuard device_guard(device);
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
      kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] {
        vllm::copy_blocks_mla_kernel<scalar_t>
            <<<launch_grid, launch_block, 0, stream>>>(
                layer_ptrs_dev.data_ptr<int64_t>(),
                block_mapping.data_ptr<int64_t>(), block_stride);
      }));
 }
 namespace vllm {
 // Used to copy/convert one element
@ -539,9 +451,6 @@ __global__ void indexer_k_quant_and_cache_kernel(
  for (int i = 0; i < VEC_SIZE; i++) {
    amax = fmaxf(amax, fabsf(float(k_val_ptr[i])));
  }
 #ifndef USE_ROCM
  __syncwarp();
 #endif
  // Reduced amax
  for (int mask = 16; mask > 0; mask /= 2) {
@ -551,9 +460,7 @@ __global__ void indexer_k_quant_and_cache_kernel(
    amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask));
 #endif
  }
-#ifndef USE_ROCM
+
  __syncwarp();
 #endif
 #if defined(__gfx942__)
  float scale = fmaxf(amax, 1e-4) / 224.0f;
 #else
--- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu
@ -35,7 +35,7 @@ template <typename Int>
 __host__ __device__ inline Int round_up(Int x, Int y) {
  static_assert(std::is_integral_v<Int>,
                "round_up argument must be integral type");
-  return (x + y - 1) / y * y;
+  return ((x + y - 1) / y) * y;
 }
 // Compute effective rows for grid configuration with swizzled SF layouts.
@ -61,37 +61,47 @@ __global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512))
  int sf_m = round_up<int>(numRows, 128);
  int sf_n_unpadded = numCols / CVT_FP4_SF_VEC_SIZE;
  int sf_n_int = round_up<int>(sf_n_unpadded, 4) / 4;
-  for (int row = numRows + blockIdx.x; row < sf_m; row += gridDim.x) {
+  int num_padded_cols = sf_n_int * 4 * CVT_FP4_SF_VEC_SIZE;
    // Each thread writes 4 uint32_t elements.
    for (int col = sf_n_unpadded + threadIdx.x * 4; col < sf_n_int;
         col += blockDim.x * 4) {
      SFout[row * sf_n_int + col] = 0x00;
    }
  }
  // Get the global scaling factor, which will be applied to the SF.
  // Note SFScale is the same as next GEMM's alpha, which is
  // (448.f / (Alpha_A / 6.f)).
  float const global_scale = SFScale == nullptr ? 1.0f : SFScale[0];
-  // Input tensor row/col loops.
+  // Iterate over all rows and cols including padded ones -
-  for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
+  //  ensures we visit every single scale factor address to initialize it.
-    for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD;
+  for (int rowIdx = blockIdx.x; rowIdx < sf_m; rowIdx += gridDim.x) {
    for (int colIdx = threadIdx.x;
         colIdx < num_padded_cols / CVT_FP4_ELTS_PER_THREAD;
         colIdx += blockDim.x) {
      int elem_idx = colIdx * CVT_FP4_ELTS_PER_THREAD;
      PackedVec in_vec;
      int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
-      PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+
-      // Get the output tensor offset.
+      // If we are outside valid rows OR outside valid columns -> Use Zeros
-      // Same as inOffset because 8 elements are packed into one uint32_t.
+      if (rowIdx >= numRows || elem_idx >= numCols) {
-      int64_t outOffset = inOffset;
+        memset(&in_vec, 0, sizeof(PackedVec));
-      auto& out_pos = out[outOffset];
+
      } else {
        // Valid Region: Load actual data
        in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
      }
      auto sf_out =
          cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
                                             CVT_FP4_NUM_THREADS_PER_SF>(
              rowIdx, colIdx, numKTiles, SFout);
-      out_pos =
+      auto out_val =
          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, global_scale, sf_out);
      // We do NOT write output for padding because the 'out' tensor is not
      // padded.
      if (rowIdx < numRows && elem_idx < numCols) {
        // Same as inOffset because 8 elements are packed into one uint32_t.
        out[inOffset] = out_val;
      }
    }
  }
 }
@ -134,4 +144,4 @@ void scaled_fp4_quant_sm1xxa(torch::Tensor const& output,
        m, n, input_ptr, input_sf_ptr, reinterpret_cast<uint32_t*>(output_ptr),
        reinterpret_cast<uint32_t*>(sf_out));
  });
-}
+}
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@ -685,16 +685,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
      "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
  cache_ops.impl("swap_blocks", torch::kCUDA, &swap_blocks);
  // Copy the cache blocks from src to dst.
  cache_ops.def(
      "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
      "Tensor block_mapping) -> ()");
  cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
  cache_ops.def(
      "copy_blocks_mla(Tensor(a!)[] kv_caches, Tensor block_mapping) -> ()");
  cache_ops.impl("copy_blocks_mla", torch::kCUDA, &copy_blocks_mla);
  // Reshape the key and value tensors and cache them.
  cache_ops.def(
      "reshape_and_cache(Tensor key, Tensor value,"
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@ -183,7 +183,7 @@ ARG nvcc_threads=8
 ENV NVCC_THREADS=$nvcc_threads
 ARG USE_SCCACHE
-ARG SCCACHE_DOWNLOAD_URL=https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz
+ARG SCCACHE_DOWNLOAD_URL
 ARG SCCACHE_ENDPOINT
 ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
 ARG SCCACHE_REGION_NAME=us-west-2
@ -201,10 +201,16 @@ ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build"
 RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$USE_SCCACHE" = "1" ]; then \
        echo "Installing sccache..." \
        && case "${TARGETPLATFORM}" in \
          linux/arm64) SCCACHE_ARCH="aarch64" ;; \
          linux/amd64) SCCACHE_ARCH="x86_64" ;; \
          *) echo "Unsupported TARGETPLATFORM for sccache: ${TARGETPLATFORM}" >&2; exit 1 ;; \
        esac \
        && export SCCACHE_DOWNLOAD_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
        && curl -L -o sccache.tar.gz ${SCCACHE_DOWNLOAD_URL} \
        && tar -xzf sccache.tar.gz \
-        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
+        && sudo mv sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
-        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
+        && rm -rf sccache.tar.gz sccache-v0.8.1-${SCCACHE_ARCH}-unknown-linux-musl \
        && if [ ! -z ${SCCACHE_ENDPOINT} ] ; then export SCCACHE_ENDPOINT=${SCCACHE_ENDPOINT} ; fi \
        && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
        && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
--- a/docs/deployment/integrations/kserve.md
+++ b/docs/deployment/integrations/kserve.md
@ -2,4 +2,4 @@
 vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
-Please see [this guide](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) for more details on using vLLM with KServe.
+You can use vLLM with KServe's [Hugging Face serving runtime](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) or via [`LLMInferenceService` that uses llm-d](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview).
--- a/docs/deployment/integrations/llm-d.md
+++ b/docs/deployment/integrations/llm-d.md
@ -0,0 +1,5 @@
 # llm-d
 vLLM can be deployed with [llm-d](https://github.com/llm-d/llm-d), a Kubernetes-native distributed inference serving stack providing well-lit paths for anyone to serve large generative AI models at scale. It helps achieve the fastest "time to state-of-the-art (SOTA) performance" for key OSS models across most hardware accelerators and infrastructure providers.
 You can use vLLM with llm-d directly by following [this guide](https://llm-d.ai/docs/guide) or via [KServe's LLMInferenceService](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview).
--- a/docs/deployment/k8s.md
+++ b/docs/deployment/k8s.md
@ -12,6 +12,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
 - [Helm](frameworks/helm.md)
 - [InftyAI/llmaz](integrations/llmaz.md)
 - [llm-d](integrations/llm-d.md)
 - [KAITO](integrations/kaito.md)
 - [KServe](integrations/kserve.md)
 - [Kthena](integrations/kthena.md)
--- a/setup.py
+++ b/setup.py
@ -50,15 +50,15 @@ elif not (sys.platform.startswith("linux") or sys.platform.startswith("darwin"))
        sys.platform,
    )
    VLLM_TARGET_DEVICE = "empty"
-elif (
+elif sys.platform.startswith("linux") and os.getenv("VLLM_TARGET_DEVICE") is None:
-    sys.platform.startswith("linux")
+    if torch.version.hip is not None:
-    and torch.version.cuda is None
+        VLLM_TARGET_DEVICE = "rocm"
-    and os.getenv("VLLM_TARGET_DEVICE") is None
+        logger.info("Auto-detected ROCm")
-    and torch.version.hip is None
+    elif torch.version.cuda is not None:
-):
+        VLLM_TARGET_DEVICE = "cuda"
-    # if cuda or hip is not available and VLLM_TARGET_DEVICE is not set,
+        logger.info("Auto-detected CUDA")
-    # fallback to cpu
+    else:
-    VLLM_TARGET_DEVICE = "cpu"
+        VLLM_TARGET_DEVICE = "cpu"
 def is_sccache_available() -> bool:
@ -108,20 +108,26 @@ class cmake_build_ext(build_ext):
                num_jobs = os.cpu_count()
        nvcc_threads = None
-        if _is_cuda() and get_nvcc_cuda_version() >= Version("11.2"):
+        if _is_cuda() and CUDA_HOME is not None:
-            # `nvcc_threads` is either the value of the NVCC_THREADS
+            try:
-            # environment variable (if defined) or 1.
+                nvcc_version = get_nvcc_cuda_version()
-            # when it is set, we reduce `num_jobs` to avoid
+                if nvcc_version >= Version("11.2"):
-            # overloading the system.
+                    # `nvcc_threads` is either the value of the NVCC_THREADS
-            nvcc_threads = envs.NVCC_THREADS
+                    # environment variable (if defined) or 1.
-            if nvcc_threads is not None:
+                    # when it is set, we reduce `num_jobs` to avoid
-                nvcc_threads = int(nvcc_threads)
+                    # overloading the system.
-                logger.info(
+                    nvcc_threads = envs.NVCC_THREADS
-                    "Using NVCC_THREADS=%d as the number of nvcc threads.", nvcc_threads
+                    if nvcc_threads is not None:
-                )
+                        nvcc_threads = int(nvcc_threads)
-            else:
+                        logger.info(
-                nvcc_threads = 1
+                            "Using NVCC_THREADS=%d as the number of nvcc threads.",
-            num_jobs = max(1, num_jobs // nvcc_threads)
+                            nvcc_threads,
                        )
                    else:
                        nvcc_threads = 1
                    num_jobs = max(1, num_jobs // nvcc_threads)
            except Exception as e:
                logger.warning("Failed to get NVCC version: %s", e)
        return num_jobs, nvcc_threads
@ -199,9 +205,9 @@ class cmake_build_ext(build_ext):
            # Default build tool to whatever cmake picks.
            build_tool = []
        # Make sure we use the nvcc from CUDA_HOME
-        if _is_cuda():
+        if _is_cuda() and CUDA_HOME is not None:
            cmake_args += [f"-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc"]
-        elif _is_hip():
+        elif _is_hip() and ROCM_HOME is not None:
            cmake_args += [f"-DROCM_PATH={ROCM_HOME}"]
        other_cmake_args = os.environ.get("CMAKE_ARGS")
@ -339,6 +345,89 @@ class precompiled_wheel_utils:
            wheels = json.loads(resp.read().decode("utf-8"))
        return wheels, repo_url
    @staticmethod
    def is_rocm_system() -> bool:
        """Detect ROCm without relying on torch (for build environment)."""
        if os.getenv("ROCM_PATH"):
            return True
        if os.path.isdir("/opt/rocm"):
            return True
        if which("rocminfo") is not None:
            return True
        try:
            import torch
            return torch.version.hip is not None
        except ImportError:
            return False
    @staticmethod
    def find_local_rocm_wheel() -> str | None:
        """Search for a local vllm wheel in common locations."""
        import glob
        for pattern in ["/vllm-workspace/dist/vllm-*.whl", "./dist/vllm-*.whl"]:
            wheels = glob.glob(pattern)
            if wheels:
                return sorted(wheels)[-1]
        return None
    @staticmethod
    def fetch_wheel_from_pypi_index(index_url: str, package: str = "vllm") -> str:
        """Fetch the latest wheel URL from a PyPI-style simple index."""
        import platform
        from html.parser import HTMLParser
        from urllib.parse import urljoin
        from urllib.request import urlopen
        arch = platform.machine()
        class WheelLinkParser(HTMLParser):
            def __init__(self):
                super().__init__()
                self.wheels = []
            def handle_starttag(self, tag, attrs):
                if tag == "a":
                    for name, value in attrs:
                        if name == "href" and value.endswith(".whl"):
                            self.wheels.append(value)
        simple_url = f"{index_url.rstrip('/')}/{package}/"
        print(f"Fetching wheel list from {simple_url}")
        with urlopen(simple_url) as resp:
            html = resp.read().decode("utf-8")
        parser = WheelLinkParser()
        parser.feed(html)
        for wheel in reversed(parser.wheels):
            if arch in wheel:
                if wheel.startswith("http"):
                    return wheel
                return urljoin(simple_url, wheel)
        raise ValueError(f"No compatible wheel found for {arch} at {simple_url}")
    @staticmethod
    def determine_wheel_url_rocm() -> tuple[str, str | None]:
        """Choose the precompiled ROCm wheel: a local one if found, else the index.

        Returns:
            ``(wheel, download_filename)``. For a local wheel the second item is
            ``None``. For a wheel taken from the index it is the last path
            component of the URL with any ``#...`` fragment removed.
        """
        # A wheel that is already on disk takes priority over anything remote.
        local_wheel = precompiled_wheel_utils.find_local_rocm_wheel()
        if local_wheel is None:
            # Nothing local: query the index, which VLLM_ROCM_WHEEL_INDEX can
            # override.
            index_url = os.getenv(
                "VLLM_ROCM_WHEEL_INDEX", "https://pypi.amd.com/vllm-rocm/simple"
            )
            print(f"Fetching ROCm precompiled wheel from {index_url}")
            wheel_url = precompiled_wheel_utils.fetch_wheel_from_pypi_index(index_url)
            download_filename = wheel_url.rsplit("/", 1)[-1].partition("#")[0]
            print(f"Using ROCm precompiled wheel: {wheel_url}")
            return wheel_url, download_filename
        print(f"Found local ROCm wheel: {local_wheel}")
        return local_wheel, None
    @staticmethod
    def determine_wheel_url() -> tuple[str, str | None]:
        """
@ -359,6 +448,11 @@ class precompiled_wheel_utils:
            print(f"Using user-specified precompiled wheel location: {wheel_location}")
            return wheel_location, None
        else:
            # ROCm: use local wheel or AMD's PyPI index
            # TODO: When we have ROCm nightly wheels, we can update this logic.
            if precompiled_wheel_utils.is_rocm_system():
                return precompiled_wheel_utils.determine_wheel_url_rocm()
            import platform
            arch = platform.machine()
@ -465,6 +559,8 @@ class precompiled_wheel_utils:
                    "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
                    "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
                    "vllm/cumem_allocator.abi3.so",
                    # ROCm-specific libraries
                    "vllm/_rocm_C.abi3.so",
                ]
                flash_attn_regex = re.compile(
@ -601,6 +697,8 @@ def get_rocm_version():
    # Get the Rocm version from the ROCM_HOME/bin/librocm-core.so
    # see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
    try:
        if ROCM_HOME is None:
            return None
        librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
        if not librocm_core_file.is_file():
            return None
@ -745,7 +843,9 @@ if _is_hip():
 if _is_cuda():
    ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
-    if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
+    if envs.VLLM_USE_PRECOMPILED or (
        CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3")
    ):
        # FA3 requires CUDA 12.3 or later
        ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
        # Optional since this doesn't get built (produce an .so file) when
--- a/tests/engine/test_arg_utils.py
+++ b/tests/engine/test_arg_utils.py
@ -511,6 +511,16 @@ def test_human_readable_model_len():
    args = parser.parse_args(["--max-model-len", "10.2123451234567t"])
    assert args.max_model_len == 10212345123456
    # Special value -1 for auto-fit to GPU memory
    args = parser.parse_args(["--max-model-len", "-1"])
    assert args.max_model_len == -1
    # 'auto' is an alias for -1
    args = parser.parse_args(["--max-model-len", "auto"])
    assert args.max_model_len == -1
    args = parser.parse_args(["--max-model-len", "AUTO"])
    assert args.max_model_len == -1
    # Invalid (do not allow decimals with binary multipliers)
    for invalid in ["1a", "pwd", "10.24", "1.23M", "1.22T"]:
        with pytest.raises(ArgumentError):
--- a/tests/entrypoints/openai/conftest.py
+++ b/tests/entrypoints/openai/conftest.py
@ -5,6 +5,30 @@ import pytest
 from vllm.assets.audio import AudioAsset
 def add_attention_backend(server_args, attention_config):
    """Add ``--attention-backend <name>`` to the server args when configured.

    Args:
        server_args: List of server arguments; extended in-place.
        attention_config: Dict with a 'backend' key, or None. Nothing is
            appended when it is empty/None or has no 'backend' key.
    """
    if not attention_config or "backend" not in attention_config:
        return
    server_args.extend(("--attention-backend", attention_config["backend"]))
@pytest.fixture(scope="module")
 def rocm_aiter_fa_attention():
    """Return attention config for transcription/translation tests on ROCm.
    On ROCm, audio tests require ROCM_AITER_FA attention backend.
    """
    from vllm.platforms import current_platform
    if current_platform.is_rocm():
        return {"backend": "ROCM_AITER_FA"}
    return None
@pytest.fixture
 def mary_had_lamb():
    path = AudioAsset("mary_had_lamb").get_local_path()
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@ -8,7 +8,7 @@ import pytest
 import pytest_asyncio
 from vllm.assets.audio import AudioAsset
-from vllm.multimodal.utils import encode_audio_base64, fetch_audio
+from vllm.multimodal.utils import encode_audio_base64, encode_audio_url, fetch_audio
 from ...utils import RemoteOpenAIServer
@ -53,6 +53,14 @@ def base64_encoded_audio() -> dict[str, str]:
    }
@pytest.fixture(scope="session")
 def url_encoded_audio() -> dict[str, str]:
    return {
        audio_url: encode_audio_url(*fetch_audio(audio_url))
        for audio_url in TEST_AUDIO_URLS
    }
 def dummy_messages_from_audio_url(
    audio_urls: str | list[str],
    content_text: str = "What's happening in this audio?",
@ -149,11 +157,9 @@ async def test_single_chat_session_audio_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    audio_url: str,
-    base64_encoded_audio: dict[str, str],
+    url_encoded_audio: dict[str, str],
 ):
-    messages = dummy_messages_from_audio_url(
+    messages = dummy_messages_from_audio_url(url_encoded_audio[audio_url])
        f"data:audio/wav;base64,{base64_encoded_audio[audio_url]}"
    )
    # test single completion
    chat_completion = await client.chat.completions.create(
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@ -254,12 +254,11 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ]
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
-        max_completion_tokens=10,
+        max_completion_tokens=5,
        logprobs=True,
        top_logprobs=5,
    )
@ -267,13 +266,14 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=37, total_tokens=47
+        completion_tokens=5, prompt_tokens=37, total_tokens=42
    )
    message = choice.message
-    assert message.content is not None and len(message.content) >= 10
+    assert message.content is not None and len(message.content) >= 5
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})
@ -282,7 +282,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, model_name: str):
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
-        max_completion_tokens=10,
+        max_completion_tokens=5,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
--- a/tests/entrypoints/openai/test_optional_middleware.py
+++ b/tests/entrypoints/openai/test_optional_middleware.py
@ -39,6 +39,7 @@ def server(request: pytest.FixtureRequest):
        "2",
        *passed_params,
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@ -504,7 +504,11 @@ async def test_web_search(client: OpenAI, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_code_interpreter(client: OpenAI, model_name: str):
-    response = await client.responses.create(
+    # Code interpreter may need more time for container init + code execution
    timeout_value = client.timeout * 3
    client_with_timeout = client.with_options(timeout=timeout_value)
    response = await client_with_timeout.responses.create(
        model=model_name,
        # TODO: Ideally should be able to set max tool calls
        # to prevent multi-turn, but it is not currently supported
@ -868,6 +872,7 @@ async def test_output_messages_enabled(client: OpenAI, model_name: str, server):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.flaky(reruns=3)
 async def test_function_call_with_previous_input_messages(
    client: OpenAI, model_name: str
 ):
--- a/tests/entrypoints/openai/test_serving_tokens.py
+++ b/tests/entrypoints/openai/test_serving_tokens.py
@ -93,6 +93,7 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
        add_generation_prompt=True,
        enable_thinking=False,  # default with Qwen3
    )
    for ignore_eos in [True, False]:
        payload = {
            "model": MODEL_NAME,
@ -108,9 +109,8 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
        }
        generate_resp = await client.post(GEN_ENDPOINT, json=payload)
        generate_data = generate_resp.json()
-        generate_res = tokenizer.decode(
+        gen_token_ids = generate_data["choices"][0]["token_ids"]
-            generate_data["choices"][0]["token_ids"], skip_special_tokens=True
+        generate_res = tokenizer.decode(gen_token_ids, skip_special_tokens=True)
        )
        payload = {
            "model": MODEL_NAME,
@ -119,12 +119,33 @@ async def test_same_response_as_chat_completions(client, tokenizer, messages):
            "temperature": 0.0,
            "stream": False,
            "ignore_eos": ignore_eos,
-            "chat_template_kwargs": dict(enable_thinking=False),
+            "chat_template_kwargs": {"enable_thinking": False},
        }
        completions_resp = await client.post("/v1/chat/completions", json=payload)
        completions_data = completions_resp.json()
        completions_res = completions_data["choices"][0]["message"]["content"]
        if ignore_eos:
            # When ignoring EOS, only compare up to the first EOS token
            # Post-EOS generation is undefined and may differ
            eos_tokens = {
                tokenizer.eos_token_id,
                *tokenizer.additional_special_tokens_ids,
            }
            # Find first EOS in generated tokens
            eos_pos = None
            for i, tid in enumerate(gen_token_ids):
                if tid in eos_tokens:
                    eos_pos = i
                    break
            if eos_pos is not None:
                gen_token_ids_truncated = gen_token_ids[:eos_pos]
                generate_res = tokenizer.decode(
                    gen_token_ids_truncated, skip_special_tokens=True
                )
                # Truncate completions_res to same length for comparison
                completions_res = completions_res[: len(generate_res)]
        assert generate_res == completions_res
--- a/tests/entrypoints/openai/test_shutdown.py
+++ b/tests/entrypoints/openai/test_shutdown.py
@ -9,10 +9,16 @@ import time
 import openai
 import pytest
 from vllm.platforms import current_platform
 from vllm.utils.network_utils import get_open_port
 MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
 # GPU initialization might take longer
 _IS_ROCM = current_platform.is_rocm()
 _SERVER_STARTUP_TIMEOUT = 120
 _PROCESS_EXIT_TIMEOUT = 15
@pytest.mark.asyncio
 async def test_shutdown_on_engine_failure():
@ -45,9 +51,11 @@ async def test_shutdown_on_engine_failure():
            "2",
            "--disable-frontend-multiprocessing",
        ],
-        stdout=subprocess.PIPE,
+        # ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when
-        stderr=subprocess.PIPE,
+        # stdout/stderr pipes are enabled during ROCm GPU initialization.
-        text=True,
+        stdout=None if _IS_ROCM else subprocess.PIPE,
        stderr=None if _IS_ROCM else subprocess.PIPE,
        text=None if _IS_ROCM else True,
        preexec_fn=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN),
    )
@ -61,7 +69,7 @@ async def test_shutdown_on_engine_failure():
    )
    # Poll until server is ready
-    while time.time() - start_time < 30:
+    while time.time() - start_time < _SERVER_STARTUP_TIMEOUT:
        try:
            await client.completions.create(
                model=MODEL_NAME, prompt="Hello", max_tokens=1
@ -70,14 +78,18 @@ async def test_shutdown_on_engine_failure():
        except Exception:
            time.sleep(0.5)
            if proc.poll() is not None:
-                stdout, stderr = proc.communicate(timeout=1)
+                if _IS_ROCM:
-                pytest.fail(
+                    pytest.fail(f"Server died during startup: {proc.returncode}")
-                    f"Server died during startup. stdout: {stdout}, stderr: {stderr}"
+                else:
-                )
+                    stdout, stderr = proc.communicate(timeout=1)
                    pytest.fail(
                        f"Server died during startup. "
                        f"stdout: {stdout}, stderr: {stderr}"
                    )
    else:
        proc.terminate()
-        proc.wait(timeout=5)
+        proc.wait(timeout=_PROCESS_EXIT_TIMEOUT)
-        pytest.fail("Server failed to start in 30 seconds")
+        pytest.fail(f"Server failed to start in {_SERVER_STARTUP_TIMEOUT} seconds")
    # Kill server to simulate crash
    proc.terminate()
@ -89,5 +101,5 @@ async def test_shutdown_on_engine_failure():
            model=MODEL_NAME, prompt="This should fail", max_tokens=1
        )
-    return_code = proc.wait(timeout=5)
+    return_code = proc.wait(timeout=_PROCESS_EXIT_TIMEOUT)
    assert return_code is not None
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@ -7,6 +7,7 @@ import json
 import pytest
 from ...utils import RemoteOpenAIServer
 from .conftest import add_attention_backend
 MISTRAL_FORMAT_ARGS = [
    "--tokenizer_mode",
@ -20,12 +21,14 @@ MISTRAL_FORMAT_ARGS = [
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", ["mistralai/Voxtral-Mini-3B-2507"])
-async def test_basic_audio(mary_had_lamb, model_name):
+async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention):
    server_args = ["--enforce-eager"]
    if model_name.startswith("mistralai"):
        server_args += MISTRAL_FORMAT_ARGS
    add_attention_backend(server_args, rocm_aiter_fa_attention)
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
@ -44,8 +47,13 @@ async def test_basic_audio(mary_had_lamb, model_name):
@pytest.mark.asyncio
-async def test_basic_audio_with_lora(mary_had_lamb):
+async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
    """Ensure STT (transcribe) requests can pass LoRA through to generate."""
    # ROCm SPECIFIC CONFIGURATION:
    # To ensure the test passes on ROCm, we modify the max model length to 512.
    # We DO NOT apply this to other platforms to maintain strict upstream parity.
    from vllm.platforms import current_platform
    model_name = "ibm-granite/granite-speech-3.3-2b"
    lora_model_name = "speech"
    server_args = [
@ -56,11 +64,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
        "--lora-modules",
        f"{lora_model_name}={model_name}",
        "--max-model-len",
-        "2048",
+        "512" if current_platform.is_rocm() else "2048",
        "--max-num-seqs",
        "1",
    ]
    add_attention_backend(server_args, rocm_aiter_fa_attention)
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
@ -79,12 +89,14 @@ async def test_basic_audio_with_lora(mary_had_lamb):
@pytest.mark.asyncio
-async def test_basic_audio_gemma(foscolo):
+async def test_basic_audio_gemma(foscolo, rocm_aiter_fa_attention):
    # Gemma accuracy on some of the audio samples we use is particularly bad,
    # hence we use a different one here. WER is evaluated separately.
    model_name = "google/gemma-3n-E2B-it"
    server_args = ["--enforce-eager"]
    add_attention_backend(server_args, rocm_aiter_fa_attention)
    with RemoteOpenAIServer(
        model_name, server_args, max_wait_seconds=480
    ) as remote_server:
--- a/tests/entrypoints/openai/test_translation_validation.py
+++ b/tests/entrypoints/openai/test_translation_validation.py
@ -14,16 +14,26 @@ import pytest_asyncio
 import soundfile as sf
 from ...utils import RemoteOpenAIServer
 from .conftest import add_attention_backend
 SERVER_ARGS = ["--enforce-eager"]
 def _get_server_args(attention_config):
    """Build the server argument list for a given attention configuration.

    A fresh copy of the module-level SERVER_ARGS is made on every call and
    handed to add_attention_backend together with ``attention_config``, so
    the shared SERVER_ARGS list itself is never the object passed along.
    """
    server_args = list(SERVER_ARGS)
    add_attention_backend(server_args, attention_config)
    return server_args
@pytest.fixture(
    scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"]
 )
-def server(request):
+def server(request, rocm_aiter_fa_attention):
    # Parametrize over model name
-    with RemoteOpenAIServer(request.param, SERVER_ARGS) as remote_server:
+    with RemoteOpenAIServer(
        request.param, _get_server_args(rocm_aiter_fa_attention)
    ) as remote_server:
        yield remote_server, request.param
@ -35,10 +45,12 @@ async def client_and_model(server):
@pytest.mark.asyncio
-async def test_non_asr_model(foscolo):
+async def test_non_asr_model(foscolo, rocm_aiter_fa_attention):
    # text to text model
    model_name = "JackFram/llama-68m"
-    with RemoteOpenAIServer(model_name, SERVER_ARGS) as remote_server:
+    with RemoteOpenAIServer(
        model_name, _get_server_args(rocm_aiter_fa_attention)
    ) as remote_server:
        client = remote_server.get_async_client()
        res = await client.audio.translations.create(
            model=model_name, file=foscolo, temperature=0.0
@ -49,8 +61,13 @@ async def test_non_asr_model(foscolo):
@pytest.mark.asyncio
-async def test_basic_audio_with_lora(mary_had_lamb):
+async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
    """Ensure STT (translate) requests can pass LoRA through to generate."""
    # ROCm SPECIFIC CONFIGURATION:
    # To ensure the test passes on ROCm, we modify the max model length to 512.
    # We DO NOT apply this to other platforms to maintain strict upstream parity.
    from vllm.platforms import current_platform
    # NOTE - careful to call this test before the module scoped server
    # fixture, otherwise it'll OOMkill the CI
    model_name = "ibm-granite/granite-speech-3.3-2b"
@ -63,11 +80,13 @@ async def test_basic_audio_with_lora(mary_had_lamb):
        "--lora-modules",
        f"{lora_model_name}={model_name}",
        "--max-model-len",
-        "2048",
+        "512" if current_platform.is_rocm() else "2048",
        "--max-num-seqs",
        "1",
    ]
    add_attention_backend(server_args, rocm_aiter_fa_attention)
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
--- a/tests/entrypoints/openai/test_video.py
+++ b/tests/entrypoints/openai/test_video.py
@ -7,7 +7,8 @@ import openai
 import pytest
 import pytest_asyncio
-from vllm.multimodal.utils import encode_video_base64, fetch_video
+from vllm.multimodal.utils import encode_video_url, fetch_video
 from vllm.platforms import current_platform
 from ...utils import RemoteOpenAIServer
@ -37,7 +38,16 @@ def server():
        json.dumps({"video": MAXIMUM_VIDEOS}),
    ]
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    # ROCm: Increase timeouts to handle potential network delays and slower
    # video processing when downloading multiple videos from external sources
    env_overrides = {}
    if current_platform.is_rocm():
        env_overrides = {
            "VLLM_VIDEO_FETCH_TIMEOUT": "120",
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
        }
    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
        yield remote_server
@ -48,9 +58,9 @@ async def client(server):
@pytest.fixture(scope="session")
-def base64_encoded_video() -> dict[str, str]:
+def url_encoded_video() -> dict[str, str]:
    return {
-        video_url: encode_video_base64(fetch_video(video_url)[0])
+        video_url: encode_video_url(fetch_video(video_url)[0])
        for video_url in TEST_VIDEO_URLS
    }
@ -175,11 +185,9 @@ async def test_single_chat_session_video_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    video_url: str,
-    base64_encoded_video: dict[str, str],
+    url_encoded_video: dict[str, str],
 ):
-    messages = dummy_messages_from_video_url(
+    messages = dummy_messages_from_video_url(url_encoded_video[video_url])
        f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
    )
    # test single completion
    chat_completion = await client.chat.completions.create(
@ -223,11 +231,9 @@ async def test_single_chat_session_video_base64encoded_beamsearch(
    client: openai.AsyncOpenAI,
    model_name: str,
    video_url: str,
-    base64_encoded_video: dict[str, str],
+    url_encoded_video: dict[str, str],
 ):
-    messages = dummy_messages_from_video_url(
+    messages = dummy_messages_from_video_url(url_encoded_video[video_url])
        f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
    )
    chat_completion = await client.chat.completions.create(
        model=model_name,
@ -291,6 +297,11 @@ async def test_chat_streaming_video(
@pytest.mark.parametrize(
    "video_urls", [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))]
 )
@pytest.mark.flaky(
    reruns=2,
    reruns_delay=5,
    condition=current_platform.is_rocm(),
 )
 async def test_multi_video_input(
    client: openai.AsyncOpenAI, model_name: str, video_urls: list[str]
 ):
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@ -9,7 +9,8 @@ import pytest_asyncio
 from transformers import AutoProcessor
 from vllm.multimodal.base import MediaWithBytes
-from vllm.multimodal.utils import encode_image_base64, fetch_image
+from vllm.multimodal.utils import encode_image_url, fetch_image
 from vllm.platforms import current_platform
 from ...utils import RemoteOpenAIServer
@ -35,7 +36,7 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
    ],
    [
        "The image shows a Venn diagram with three over",
-        "The image shows a colorful Venn diagram with",
+        "The image displays a Venn diagram with three over",
    ],
    [
        "This image displays a gradient of colors ranging from",
@ -43,6 +44,27 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
    ],
 ]
 EXPECTED_MM_BEAM_SEARCH_RES_ROCM = [
    # MultiHeadAttention attn_backend: FLASH_ATTN
    # with Triton Attention backend
    [
        "The image shows a wooden boardwalk leading through a",
        "The image shows a wooden boardwalk extending into a",
    ],
    [
        "The image shows two parrots perched on",
        "The image shows two birds perched on a cur",
    ],
    [
        "The image shows a Venn diagram with three over",
        "The image contains a Venn diagram with three over",
    ],
    [
        "This image displays a gradient of colors ranging from",
        "This image displays a gradient of colors transitioning from",
    ],
 ]
@pytest.fixture(scope="module")
 def server():
@ -59,7 +81,16 @@ def server():
        json.dumps({"image": MAXIMUM_IMAGES}),
    ]
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+    # ROCm: Increase timeouts to handle potential network delays and slower
    # video processing when downloading multiple videos from external sources
    env_overrides = {}
    if current_platform.is_rocm():
        env_overrides = {
            "VLLM_VIDEO_FETCH_TIMEOUT": "120",
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
        }
    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
        yield remote_server
@ -70,11 +101,9 @@ async def client(server):
@pytest.fixture(scope="session")
-def base64_encoded_image(local_asset_server) -> dict[str, str]:
+def url_encoded_image(local_asset_server) -> dict[str, str]:
    return {
-        image_asset: encode_image_base64(
+        image_asset: encode_image_url(local_asset_server.get_image_asset(image_asset))
            local_asset_server.get_image_asset(image_asset)
        )
        for image_asset in TEST_IMAGE_ASSETS
    }
@ -234,11 +263,11 @@ async def test_single_chat_session_image_base64encoded(
    model_name: str,
    raw_image_url: str,
    image_url: str,
-    base64_encoded_image: dict[str, str],
+    url_encoded_image: dict[str, str],
 ):
    content_text = "What's in this image?"
    messages = dummy_messages_from_image_url(
-        f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",
+        url_encoded_image[raw_image_url],
        content_text,
    )
@ -288,15 +317,20 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_idx: int,
-    base64_encoded_image: dict[str, str],
+    url_encoded_image: dict[str, str],
 ):
    # ROCm: Switch expected results based on platform
    from vllm.platforms import current_platform
    # NOTE: This test also validates that we pass MM data through beam search
    raw_image_url = TEST_IMAGE_ASSETS[image_idx]
    expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
-    messages = dummy_messages_from_image_url(
+    if current_platform.is_rocm():
-        f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}"
+        expected_res = EXPECTED_MM_BEAM_SEARCH_RES_ROCM[image_idx]
-    )
+    else:
        expected_res = EXPECTED_MM_BEAM_SEARCH_RES[image_idx]
    messages = dummy_messages_from_image_url(url_encoded_image[raw_image_url])
    chat_completion = await client.chat.completions.create(
        model=model_name,
--- a/tests/entrypoints/openai/test_vision_embeds.py
+++ b/tests/entrypoints/openai/test_vision_embeds.py
@ -33,6 +33,7 @@ def _terratorch_dummy_messages():
    ]
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
 )
--- a/tests/entrypoints/pooling/basic/test_encode.py
+++ b/tests/entrypoints/pooling/basic/test_encode.py
@ -9,11 +9,6 @@ from vllm import LLM, PoolingParams
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.platforms import current_platform
 if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )
 MODEL_NAME = "intfloat/multilingual-e5-small"
 PROMPTS = [
@ -35,6 +30,12 @@ TOKEN_IDS = [
@pytest.fixture(scope="module")
 def llm():
    # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
    # that supports encoder-only models on ROCm.
    attention_config = None
    if current_platform.is_rocm():
        attention_config = {"backend": "FLEX_ATTENTION"}
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
@ -44,6 +45,7 @@ def llm():
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
        attention_config=attention_config,
    )
    yield weakref.proxy(llm)
--- a/tests/entrypoints/pooling/basic/test_truncation.py
+++ b/tests/entrypoints/pooling/basic/test_truncation.py
@ -9,11 +9,6 @@ import pytest_asyncio
 from tests.utils import RemoteOpenAIServer
 from vllm.platforms import current_platform
 if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )
 MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
 max_model_len = 128
@ -44,6 +39,10 @@ def server():
        str(max_model_len),
    ]
    # ROCm: Use Flex Attention to support encoder-only self-attention.
    if current_platform.is_rocm():
        args.extend(["--attention-backend", "FLEX_ATTENTION"])
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
--- a/tests/entrypoints/pooling/embed/conftest.py
+++ b/tests/entrypoints/pooling/embed/conftest.py
@ -0,0 +1,28 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Pytest configuration for vLLM pooling embed tests."""
 import warnings
 import torch
 from vllm.platforms import current_platform
 def pytest_collection_modifyitems(config, items):
    """Apply ROCm-only SDP backend settings when pytest collects tests.

    On non-ROCm platforms this hook does nothing. On ROCm it switches off the
    flash and memory-efficient scaled-dot-product kernels, switches the math
    kernel on, and emits a UserWarning describing the change.
    """
    if current_platform.is_rocm():
        # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
        # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
        # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
        cuda_backends = torch.backends.cuda
        cuda_backends.enable_flash_sdp(False)
        cuda_backends.enable_mem_efficient_sdp(False)
        cuda_backends.enable_math_sdp(True)
        notice = (
            "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
            "to avoid HuggingFace Transformers accuracy issues"
        )
        warnings.warn(notice, UserWarning, stacklevel=1)
--- a/tests/entrypoints/pooling/embed/test_correctness_mteb.py
+++ b/tests/entrypoints/pooling/embed/test_correctness_mteb.py
@ -13,11 +13,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
 from tests.utils import RemoteOpenAIServer
 from vllm.platforms import current_platform
 if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )
 os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
 MODEL_NAME = "intfloat/e5-small"
@ -28,6 +23,10 @@ MAIN_SCORE = 0.7422994752439667
 def server():
    args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
    # ROCm: Use Flex Attention to support encoder-only self-attention.
    if current_platform.is_rocm():
        args.extend(["--attention-backend", "FLEX_ATTENTION"])
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
--- a/tests/entrypoints/pooling/embed/test_offline.py
+++ b/tests/entrypoints/pooling/embed/test_offline.py
@ -11,11 +11,6 @@ from vllm import LLM, PoolingParams
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.platforms import current_platform
 if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )
 MODEL_NAME = "intfloat/multilingual-e5-small"
 prompts = ["The chef prepared a delicious meal."]
@ -23,6 +18,12 @@ prompts = ["The chef prepared a delicious meal."]
@pytest.fixture(scope="module")
 def llm():
    # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
    # that supports encoder-only models on ROCm.
    attention_config = None
    if current_platform.is_rocm():
        attention_config = {"backend": "FLEX_ATTENTION"}
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
@ -32,6 +33,7 @@ def llm():
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
        attention_config=attention_config,
    )
    yield weakref.proxy(llm)
--- a/tests/entrypoints/pooling/embed/test_online.py
+++ b/tests/entrypoints/pooling/embed/test_online.py
@ -28,16 +28,20 @@ from vllm.utils.serial_utils import (
    decode_pooling_output,
 )
 if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )
 MODEL_NAME = "intfloat/multilingual-e5-small"
 DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
 DTYPE = "bfloat16"
 if current_platform.is_rocm():
    # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
    # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
    torch.backends.cuda.enable_flash_sdp(False)
    torch.backends.cuda.enable_mem_efficient_sdp(False)
    torch.backends.cuda.enable_math_sdp(True)
@pytest.fixture(scope="module")
 def server():
    args = [
@ -53,6 +57,10 @@ def server():
        DUMMY_CHAT_TEMPLATE,
    ]
    # ROCm: Use Flex Attention to support encoder-only self-attention.
    if current_platform.is_rocm():
        args.extend(["--attention-backend", "FLEX_ATTENTION"])
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
--- a/tests/entrypoints/pooling/embed/test_online_dimensions.py
+++ b/tests/entrypoints/pooling/embed/test_online_dimensions.py
@ -14,11 +14,6 @@ from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
 from vllm.platforms import current_platform
 if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )
 MODELS = [
    EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
    EmbedModelInfo(
@ -62,6 +57,10 @@ def server(model_info, dtype: str):
            ["--trust_remote_code", "--hf_overrides", '{"matryoshka_dimensions":[256]}']
        )
    # ROCm: Use Flex Attention to support encoder-only self-attention.
    if current_platform.is_rocm():
        args.extend(["--attention-backend", "FLEX_ATTENTION"])
    with RemoteOpenAIServer(model_info.name, args) as remote_server:
        yield remote_server
--- a/tests/entrypoints/pooling/embed/test_online_long_text.py
+++ b/tests/entrypoints/pooling/embed/test_online_long_text.py
@ -18,11 +18,6 @@ from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
 from vllm.platforms import current_platform
 if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )
 def _generate_random_text(word_count: int) -> str:
    """Generate random text with approximately the specified word count."""
@ -228,6 +223,10 @@ def server_with_chunked_processing():
        "0.8",
    ]
    # ROCm: Use Flex Attention to support encoder-only self-attention.
    if current_platform.is_rocm():
        args.extend(["--attention-backend", "FLEX_ATTENTION"])
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
--- a/tests/entrypoints/pooling/embed/test_online_vision.py
+++ b/tests/entrypoints/pooling/embed/test_online_vision.py
@ -10,7 +10,7 @@ from transformers import AutoProcessor
 from tests.utils import VLLM_PATH, RemoteOpenAIServer
 from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
 from vllm.multimodal.base import MediaWithBytes
-from vllm.multimodal.utils import encode_image_base64, fetch_image
+from vllm.multimodal.utils import fetch_image
 MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
 MAXIMUM_IMAGES = 2
@ -48,14 +48,6 @@ def server():
        yield remote_server
@pytest.fixture(scope="session")
 def base64_encoded_image(local_asset_server) -> dict[str, str]:
    return {
        image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
        for image_url in TEST_IMAGE_ASSETS
    }
 def get_hf_prompt_tokens(model_name, content, image_url):
    processor = AutoProcessor.from_pretrained(
        model_name, trust_remote_code=True, num_crops=4
--- a/tests/entrypoints/pooling/score/test_correctness_mteb.py
+++ b/tests/entrypoints/pooling/score/test_correctness_mteb.py
@ -15,11 +15,6 @@ from tests.models.language.pooling_mteb_test.mteb_utils import (
 from tests.utils import RemoteOpenAIServer
 from vllm.platforms import current_platform
 if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )
 os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
 MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
@ -30,6 +25,10 @@ st_main_score = 0.33457
 def server():
    args = ["--runner", "pooling", "--enforce-eager", "--disable-uvicorn-access-log"]
    # ROCm: Use Flex Attention to support encoder-only self-attention.
    if current_platform.is_rocm():
        args.extend(["--attention-backend", "FLEX_ATTENTION"])
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
--- a/tests/entrypoints/pooling/score/test_offline.py
+++ b/tests/entrypoints/pooling/score/test_offline.py
@ -11,16 +11,17 @@ from vllm import LLM, PoolingParams
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.platforms import current_platform
 if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )
 MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
@pytest.fixture(scope="module")
 def llm():
    # ROCm: Use FLEX_ATTENTION backend as it's the only attention backend
    # that supports encoder-only models on ROCm.
    attention_config = None
    if current_platform.is_rocm():
        attention_config = {"backend": "FLEX_ATTENTION"}
    # pytest caches the fixture so we use weakref.proxy to
    # enable garbage collection
    llm = LLM(
@ -30,6 +31,7 @@ def llm():
        gpu_memory_utilization=0.75,
        enforce_eager=True,
        seed=0,
        attention_config=attention_config,
    )
    yield weakref.proxy(llm)
--- a/tests/entrypoints/pooling/score/test_online_rerank.py
+++ b/tests/entrypoints/pooling/score/test_online_rerank.py
@ -11,11 +11,6 @@ from vllm.entrypoints.pooling.pooling.protocol import PoolingResponse
 from vllm.entrypoints.pooling.score.protocol import RerankResponse
 from vllm.platforms import current_platform
 if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )
 MODEL_NAME = "BAAI/bge-reranker-base"
 DTYPE = "bfloat16"
@ -24,6 +19,10 @@ DTYPE = "bfloat16"
 def server():
    args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
    # ROCm: Use Flex Attention to support encoder-only self-attention.
    if current_platform.is_rocm():
        args.extend(["--attention-backend", "FLEX_ATTENTION"])
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
--- a/tests/entrypoints/pooling/score/test_online_score.py
+++ b/tests/entrypoints/pooling/score/test_online_score.py
@ -12,11 +12,6 @@ from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.pooling.score.protocol import ScoreResponse
 from vllm.platforms import current_platform
 if current_platform.is_rocm():
    pytest.skip(
        "Encoder self-attention is not implemented on ROCm.", allow_module_level=True
    )
 MODELS = [
    {"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True},
    {"name": "BAAI/bge-base-en-v1.5", "is_cross_encoder": False},
@ -44,6 +39,10 @@ def model(request):
 def server(model: dict[str, Any]):
    args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
    # ROCm: Use Flex Attention to support encoder-only self-attention.
    if current_platform.is_rocm():
        args.extend(["--attention-backend", "FLEX_ATTENTION"])
    with RemoteOpenAIServer(model["name"], args) as remote_server:
        yield remote_server
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@ -25,9 +25,9 @@ from vllm.entrypoints.chat_utils import (
 )
 from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
 from vllm.multimodal.utils import (
-    encode_audio_base64,
+    encode_audio_url,
-    encode_image_base64,
+    encode_image_url,
-    encode_video_base64,
+    encode_video_url,
 )
 from vllm.tokenizers import get_tokenizer
 from vllm.tokenizers.mistral import MistralTokenizer
@ -141,22 +141,19 @@ def mistral_model_config():
@pytest.fixture(scope="module")
 def image_url():
    image = ImageAsset("cherry_blossom")
-    base64 = encode_image_base64(image.pil_image)
+    return encode_image_url(image.pil_image)
    return f"data:image/jpeg;base64,{base64}"
@pytest.fixture(scope="module")
 def video_url():
    video = VideoAsset("baby_reading", 1)
-    base64 = encode_video_base64(video.np_ndarrays)
+    return encode_video_url(video.np_ndarrays)
    return f"data:video/jpeg;base64,{base64}"
@pytest.fixture(scope="module")
 def audio_url():
    audio = AudioAsset("mary_had_lamb")
-    base64 = encode_audio_base64(*audio.audio_and_sample_rate)
+    return encode_audio_url(*audio.audio_and_sample_rate)
    return f"data:audio/ogg;base64,{base64}"
 def _assert_mm_data_is_image_input(
--- a/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml
+++ b/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml
@ -0,0 +1,11 @@
 # GSM8K eval config for the FP8 Qwen3-Next-80B-A3B-Instruct checkpoint, served
 # with tensor-parallel size 2 and expert parallelism enabled (see server_args).
 model_name: "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8"
 # Presumably the minimum accuracy the evaluation must reach -- confirm against
 # test_gsm8k_correctness.py.
 accuracy_threshold: 0.85
 num_questions: 1319
 num_fewshot: 5
 # ">-" folds the lines below into one space-separated string without a
 # trailing newline.
 server_args: >-
  --max-model-len 4096
  --tensor-parallel-size 2
  --enable-expert-parallel
  --async-scheduling
 env:
  VLLM_USE_FLASHINFER_MOE_FP8: "1"
--- a/tests/evals/gsm8k/configs/models-blackwell.txt
+++ b/tests/evals/gsm8k/configs/models-blackwell.txt
@ -4,3 +4,4 @@ Qwen1.5-MoE-W4A16-CT.yaml
 DeepSeek-V2-Lite-Instruct-FP8.yaml
 Qwen3-30B-A3B-NVFP4.yaml
 Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
 Qwen3-Next-FP8-EP2.yaml
--- a/tests/evals/gsm8k/test_gsm8k_correctness.py
+++ b/tests/evals/gsm8k/test_gsm8k_correctness.py
@ -71,6 +71,7 @@ def test_gsm8k_correctness(config_filename):
    print(f"Number of questions: {eval_config['num_questions']}")
    print(f"Number of few-shot examples: {eval_config['num_fewshot']}")
    print(f"Server args: {' '.join(server_args)}")
    print(f"Environment variables: {env_dict}")
    # Launch server and run evaluation
    with RemoteOpenAIServer(
--- a/tests/kernels/attention/test_cache.py
+++ b/tests/kernels/attention/test_cache.py
@ -40,93 +40,6 @@ KV_CACHE_DTYPE = ["auto", "fp8"]
 RESHAPE_FLASH_IMPLEMENTATIONS = ["cuda", "triton"]
@pytest.mark.parametrize("num_mappings", NUM_MAPPINGS)
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@torch.inference_mode()
 def test_copy_blocks(
    kv_cache_factory,
    num_mappings: int,
    num_layers: int,
    num_heads: int,
    head_size: int,
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
    seed: int,
    kv_cache_dtype: str,
    device: str,
 ) -> None:
    # Checks ops.copy_blocks against a reference: every source block is copied
    # to two destination blocks in each layer's key and value cache, and the
    # result must match the same copies applied with Tensor.copy_ on clones
    # taken before the kernel ran.

    # fp8 runs are skipped when head_size is not a multiple of 16.
    if kv_cache_dtype == "fp8" and head_size % 16:
        pytest.skip()
    current_platform.seed_everything(seed)
    torch.set_default_device(device)
    torch.cuda.set_device(device)
    # Generate random block mappings where each source block is mapped to two
    # destination blocks.
    # NOTE(review): the sampling below draws 2 * num_mappings destinations from
    # the num_blocks - num_mappings blocks left after picking sources, i.e. it
    # needs 3 * num_mappings <= num_blocks; this assert is weaker than that.
    assert 2 * num_mappings <= num_blocks
    src_blocks = random.sample(range(num_blocks), num_mappings)
    # Destinations are drawn only from blocks that are not sources, so no
    # block is both read and written.
    remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
    dst_blocks = random.sample(remaining_blocks, 2 * num_mappings)
    block_mapping: list[tuple[int, int]] = []
    for i in range(num_mappings):
        src = src_blocks[i]
        dst1 = dst_blocks[2 * i]
        dst2 = dst_blocks[2 * i + 1]
        block_mapping.append((src, dst1))
        block_mapping.append((src, dst2))
    # Create the KV caches.
    key_caches, value_caches = kv_cache_factory(
        num_blocks,
        block_size,
        num_layers,
        num_heads,
        head_size,
        kv_cache_dtype,
        dtype,
        seed,
        device,
    )
    # Clone the KV caches.
    cloned_key_caches = [key_cache.clone() for key_cache in key_caches]
    cloned_value_caches = [value_cache.clone() for value_cache in value_caches]
    # Call the copy blocks kernel.
    # The mapping is passed as an int64 tensor with one (src, dst) pair per row.
    block_mapping_tensor = torch.tensor(
        block_mapping, dtype=torch.int64, device=device
    ).view(-1, 2)
    # `cond` is true only for the first HEAD_SIZES entry -- presumably this
    # restricts opcheck to that case; confirm against the opcheck helper.
    opcheck(
        torch.ops._C_cache_ops.copy_blocks,
        (key_caches, value_caches, block_mapping_tensor),
        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
        cond=(head_size == HEAD_SIZES[0]),
    )
    ops.copy_blocks(key_caches, value_caches, block_mapping_tensor)
    # Run the reference implementation.
    for src, dst in block_mapping:
        for cloned_key_cache in cloned_key_caches:
            cloned_key_cache[dst].copy_(cloned_key_cache[src])
        for cloned_value_cache in cloned_value_caches:
            cloned_value_cache[dst].copy_(cloned_value_cache[src])
    # Compare the results.
    for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
        torch.testing.assert_close(key_cache, cloned_key_cache)
    for value_cache, cloned_value_cache in zip(value_caches, cloned_value_caches):
        torch.testing.assert_close(value_cache, cloned_value_cache)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@ -763,73 +676,6 @@ def test_concat_and_cache_ds_mla(
        torch.testing.assert_close(kv_rope, ref_rope, atol=0.001, rtol=0.1)
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA)
@pytest.mark.parametrize("num_layers", NUM_LAYERS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE)
@torch.inference_mode()
 def test_copy_blocks_mla(
    kv_lora_rank: int,
    qk_rope_head_dim: int,
    block_size: int,
    num_blocks: int,
    num_layers: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
    kv_cache_dtype: str,
 ) -> None:
    # Checks ops.copy_blocks_mla against a reference: each source block is
    # copied to two destination blocks in every layer's cache, and the result
    # must match the same copies applied with Tensor.copy_ on clones taken
    # before the kernel ran.
    current_platform.seed_everything(seed)
    torch.set_default_device(device)
    torch.cuda.set_device(device)
    # Each cache entry holds kv_lora_rank + qk_rope_head_dim elements.
    entry_size = kv_lora_rank + qk_rope_head_dim
    # Build one filled cache per layer.
    kv_caches = []
    for _ in range(num_layers):
        kv_cache = _create_mla_cache(
            num_blocks, block_size, entry_size, dtype, kv_cache_dtype, device
        )
        _fill_mla_cache(kv_cache, kv_cache_dtype=kv_cache_dtype)
        kv_caches.append(kv_cache)
    # Snapshot the caches; the reference copies are applied to these clones.
    ref_caches = [kv_cache.clone() for kv_cache in kv_caches]
    # NOTE(review): 2 * num_mappings destinations are sampled from the
    # num_blocks - num_mappings non-source blocks, so this needs
    # num_blocks >= 3 * num_mappings; NUM_BLOCKS_MLA is not visible here --
    # confirm its values satisfy that.
    num_mappings = min(2, num_blocks // 2)
    src_blocks = random.sample(range(num_blocks), num_mappings)
    remaining = list(set(range(num_blocks)) - set(src_blocks))
    dst_blocks = random.sample(remaining, 2 * num_mappings)
    # Map every source block to two distinct destination blocks.
    block_mapping = []
    for i in range(num_mappings):
        src = src_blocks[i]
        dst1 = dst_blocks[2 * i]
        dst2 = dst_blocks[2 * i + 1]
        block_mapping.append((src, dst1))
        block_mapping.append((src, dst2))
    # The mapping is passed as an int64 tensor with one (src, dst) pair per row.
    block_mapping_tensor = torch.tensor(
        block_mapping, dtype=torch.int64, device=device
    ).view(-1, 2)
    # Reference result, computed on the clones before the kernel is invoked.
    for src, dst in block_mapping:
        for ref_cache in ref_caches:
            ref_cache[dst].copy_(ref_cache[src])
    opcheck(
        torch.ops._C_cache_ops.copy_blocks_mla,
        (kv_caches, block_mapping_tensor),
        test_utils=DEFAULT_OPCHECK_TEST_UTILS,
    )
    ops.copy_blocks_mla(kv_caches, block_mapping_tensor)
    # The kernel output must match the reference for every layer.
    for kv_cache, ref_cache in zip(kv_caches, ref_caches):
        torch.testing.assert_close(kv_cache, ref_cache)
@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS)
@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS)
@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA)
--- a/tests/models/multimodal/conftest.py
+++ b/tests/models/multimodal/conftest.py
@ -19,7 +19,7 @@ def pytest_collection_modifyitems(config, items):
        return
    # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
-    # accuracy issues
+    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
    # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
    torch.backends.cuda.enable_flash_sdp(False)
    torch.backends.cuda.enable_mem_efficient_sdp(False)
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@ -513,6 +513,7 @@ VLM_TEST_SETTINGS = {
        max_model_len=8192,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
        num_logprobs=10 if current_platform.is_rocm() else 5,
    ),
    "intern_vl-hf": VLMTestInfo(
        models=["OpenGVLab/InternVL3-1B-hf"],
--- a/tests/models/multimodal/generation/test_keye.py
+++ b/tests/models/multimodal/generation/test_keye.py
@ -8,7 +8,7 @@ from PIL.Image import Image
 from transformers import AutoProcessor
 from vllm import LLM, EngineArgs, SamplingParams
-from vllm.multimodal.utils import encode_image_base64
+from vllm.multimodal.utils import encode_image_url
 MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview"
@ -31,10 +31,7 @@ def test_keye_vl(
    question: str,
 ):
    images = [asset.pil_image for asset in image_assets]
-
+    image_urls = [encode_image_url(image) for image in images]
    image_urls = [
        f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images
    ]
    engine_args = EngineArgs(
        model=MODEL_NAME,
--- a/tests/models/multimodal/generation/test_vit_backend_functionality.py
+++ b/tests/models/multimodal/generation/test_vit_backend_functionality.py
@ -15,7 +15,7 @@ from transformers import AutoProcessor
 from vllm import LLM, EngineArgs, SamplingParams
 from vllm.attention.backends.registry import AttentionBackendEnum
-from vllm.multimodal.utils import encode_image_base64
+from vllm.multimodal.utils import encode_image_url
 from vllm.multimodal.video import sample_frames_from_video
 from vllm.platforms import current_platform
@ -178,8 +178,7 @@ def build_dots_ocr_prompt(images, config):
    """Build Dots.OCR specific prompt with OCR instructions."""
    # Use only stop_sign image for Dots.OCR
    image = images[0]  # Already filtered to stop_sign
-
+    image_url = encode_image_url(image)
    image_url = f"data:image/jpeg;base64,{encode_image_base64(image)}"
    placeholders = [{"type": "image_url", "image_url": {"url": image_url}}]
    messages = [
@ -204,9 +203,7 @@ def build_processor_prompt(images, config):
        config["model_name"], trust_remote_code=True
    )
-    image_urls = [
+    image_urls = [encode_image_url(img) for img in images]
        f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
    ]
    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
@ -225,9 +222,7 @@ def build_processor_prompt(images, config):
 def build_ovis_prompt(images, config):
    """Build Ovis2.5 specific prompt with custom format."""
-    image_urls = [
+    image_urls = [encode_image_url(img) for img in images]
        f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
    ]
    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
--- a/tests/models/multimodal/generation/test_voxtral.py
+++ b/tests/models/multimodal/generation/test_voxtral.py
@ -111,4 +111,5 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
    assert choice.message.content == "In the first audio clip, you hear a brief"
    assert choice.finish_reason == "length"
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@ -860,6 +860,11 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        # disable this temporarily until we support HF format
        is_available_online=False,
    ),
    "VoxtralStreamingGeneration": _HfExamplesInfo(
        "<place-holder>",
        # disable this temporarily until we support HF format
        is_available_online=False,
    ),
    # [Encoder-decoder]
    "WhisperForConditionalGeneration": _HfExamplesInfo(
        "openai/whisper-large-v3-turbo",
--- a/tests/models/test_terratorch.py
+++ b/tests/models/test_terratorch.py
@ -38,7 +38,7 @@ def test_inference(
        max_num_seqs=32,
        default_torch_num_threads=1,
    ) as vllm_model:
-        vllm_output = vllm_model.llm.encode(prompt)
+        vllm_output = vllm_model.llm.encode(prompt, pooling_task="plugin")
        assert torch.equal(
            torch.isnan(vllm_output[0].outputs.data).any(), torch.tensor(False)
        )
--- a/tests/standalone_tests/pytorch_nightly_dependency.sh
+++ b/tests/standalone_tests/pytorch_nightly_dependency.sh
@ -4,6 +4,11 @@
 set -e
 set -x
 if command -v rocminfo >/dev/null 2>&1; then
  echo "Skipping test for ROCm platform"
  exit 0
 fi
 cd /vllm-workspace/
 rm -rf .venv
@ -36,7 +41,7 @@ if diff before.txt after.txt; then
  echo "torch version not overridden."
 else
  echo "torch version overridden by nightly_torch_test.txt, \
-  if the dependency is not triggered by the pytroch nightly test,\
+  if the dependency is not triggered by the pytorch nightly test,\
  please add the dependency to the list 'white_list' in tools/pre_commit/generate_nightly_torch_test.py"
  exit 1
 fi
--- a/tests/tool_parsers/test_mistral_tool_parser.py
+++ b/tests/tool_parsers/test_mistral_tool_parser.py
@ -281,6 +281,8 @@ def test_extract_tool_calls_pre_v11_tokenizer(
        "single_tool_add",
        "single_tool_weather",
        "multiple_tool_calls",
        "complex",
        "wrong_json",
    ],
    argnames=["model_output", "expected_tool_calls", "expected_content"],
    argvalues=[
@ -326,6 +328,36 @@ def test_extract_tool_calls_pre_v11_tokenizer(
            ],
            None,
        ),
        (
            # Complex
            """hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="bash",
                        arguments=json.dumps(
                            {"command": "print(\"hello world!\")\nre.compile(r'{}')"}
                        )[:-2],
                    )
                )
            ],
            "hi{hi",
        ),
        (
            # Wrong json
            """hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
                        name="bash",
                        arguments=json.dumps(
                            {"command": "print(\"hello world!\")\nre.compile(r'{}')"}
                        ),
                    )
                )
            ],
            "hi{hi",
        ),
    ],
 )
 def test_extract_tool_calls(
@ -673,7 +705,7 @@ def test_extract_tool_calls_streaming(
        ),
        (
            # Complex
-            """[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""",  # noqa: E501
+            """hi{hi[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""",  # noqa: E501
            [
                ToolCall(
                    function=FunctionCall(
@ -684,7 +716,7 @@ def test_extract_tool_calls_streaming(
                    )
                )
            ],
-            "",
+            "hi{hi",
        ),
    ],
 )
--- a/tests/utils.py
+++ b/tests/utils.py
@ -106,6 +106,7 @@ class RemoteOpenAIServer:
            env.update(env_dict)
        serve_cmd = ["vllm", "serve", model, *vllm_serve_args]
        print(f"Launching RemoteOpenAIServer with: {' '.join(serve_cmd)}")
        print(f"Environment variables: {env}")
        self.proc: subprocess.Popen = subprocess.Popen(
            serve_cmd,
            env=env,
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@ -1798,3 +1798,60 @@ def test_request_with_prompt_embeds_and_mm_inputs(hash_fn: Callable[[Any], bytes
        )
    )
    assert block_hashes[1] == expected_hash2
 def test_auto_fit_max_model_len():
    """Test that max_model_len=-1 auto-fits to available GPU memory.

    Two scenarios are run through get_kv_cache_configs with the same
    two-layer KV cache spec: one with ample memory, where max_model_len is
    expected to stay at 1024, and one with 32 blocks' worth of memory, where
    it is expected to end up strictly between 0 and 1024.
    """
    # Create config with original_max_model_len=-1 to trigger auto-fit
    model_config = ModelConfig(max_model_len=1024)
    # Simulate the user passing -1 by setting original_max_model_len
    model_config.original_max_model_len = -1
    vllm_config = VllmConfig(model_config=model_config)
    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2  # 16KB per block per layer
    kv_cache_specs = {
        "layer_1": new_kv_cache_spec(),
        "layer_2": new_kv_cache_spec(),
    }
    # With enough memory, max_model_len stays at the derived max
    large_available_memory = mem_per_block_per_layer * 2 * 1024  # plenty of memory
    # The return value is not used; the test only inspects
    # vllm_config.model_config.max_model_len after the call.
    _kv_cache_configs = get_kv_cache_configs(
        vllm_config, [kv_cache_specs], [large_available_memory]
    )
    assert vllm_config.model_config.max_model_len == 1024
    # Reset for next test
    # (fresh config objects, so nothing carries over from the first scenario)
    model_config = ModelConfig(max_model_len=1024)
    model_config.original_max_model_len = -1
    vllm_config = VllmConfig(model_config=model_config)
    # With limited memory, max_model_len should be reduced
    # Need memory for at least max_model_len tokens
    # 32 blocks worth of memory for 2 layers = can fit 32*16=512 tokens
    limited_memory = mem_per_block_per_layer * 2 * 32
    _kv_cache_configs = get_kv_cache_configs(
        vllm_config, [kv_cache_specs], [limited_memory]
    )
    # Should be reduced to fit in memory
    # (only the bounds are checked, not an exact value)
    assert vllm_config.model_config.max_model_len < 1024
    assert vllm_config.model_config.max_model_len > 0
 def test_auto_fit_max_model_len_not_triggered():
    """Test that auto-fit is not triggered when original_max_model_len is not -1.

    Builds a config with max_model_len=16 without touching
    original_max_model_len, runs get_kv_cache_configs with 32 blocks' worth
    of memory for two layers, and checks max_model_len is still 16.
    """
    model_config = ModelConfig(max_model_len=16)
    # original_max_model_len should be None by default, not -1
    vllm_config = VllmConfig(model_config=model_config)
    # Same per-block, per-layer byte count as used by the auto-fit test.
    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
    kv_cache_specs = {
        "layer_1": new_kv_cache_spec(),
        "layer_2": new_kv_cache_spec(),
    }
    # This should work normally without auto-fit
    # (the return value is unused; only the config is inspected afterwards)
    _kv_cache_configs = get_kv_cache_configs(
        vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32]
    )
    assert vllm_config.model_config.max_model_len == 16
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@ -1356,6 +1356,69 @@ def test_kv_cache_events(blocks_to_cache: int):
    assert len(manager.block_pool.cached_block_hash_to_block) == 0
 def test_null_parent_block_hash():
    """Check the BlockStored parent hash when the physical parent is the null block.

    A four-token request with block_size=1 yields four block hashes. The block
    list handed to ``cache_full_blocks`` has ``pool.null_block`` at index
    ``num_cached_blocks - 1``, i.e. directly before the first block to be
    cached. The single emitted BlockStored event must take its parent hash
    from ``req.block_hashes[num_cached_blocks - 1]``, and the null block must
    still have no hash afterwards.
    """
    block_size = 1
    num_cached_blocks = 2
    num_full_blocks = 4
    pool = BlockPool(
        num_gpu_blocks=8,
        enable_caching=True,
        hash_block_size=block_size,
        enable_kv_cache_events=True,
    )
    req = make_request(
        "req_null_parent",
        prompt_token_ids=[10, 11, 12, 13],
        block_size=block_size,
        hash_fn=sha256,
    )
    assert len(req.block_hashes) == num_full_blocks
    # Physical parent is `null_block` (no hash), while the logical parent hash
    # still exists in `request.block_hashes[num_cached_blocks - 1]`.
    assert pool.null_block.block_hash is None
    new_blocks = pool.get_new_blocks(num_full_blocks - 1)
    # Both slices are unpacked so `blocks` is a flat list of num_full_blocks
    # blocks; without the leading `*` the first entry would be a nested list.
    blocks = [
        *new_blocks[: num_cached_blocks - 1],
        pool.null_block,  # physical parent
        *new_blocks[num_cached_blocks - 1 :],
    ]
    pool.cache_full_blocks(
        request=req,
        blocks=blocks,
        num_cached_blocks=num_cached_blocks,
        num_full_blocks=num_full_blocks,
        block_size=block_size,
        kv_cache_group_id=0,
    )
    # Exactly one BlockStored event is expected for the newly cached blocks.
    events = pool.take_events()
    assert len(events) == 1
    event = events[0]
    assert isinstance(event, BlockStored)
    # The parent hash must come from the request's hash list, not from the
    # (hash-less) null block sitting at the parent position.
    expected_parent = kv_cache_utils.maybe_convert_block_hash(
        req.block_hashes[num_cached_blocks - 1]
    )
    assert event.parent_block_hash == expected_parent
    assert event.parent_block_hash is not None
    expected_new_hashes = [
        kv_cache_utils.maybe_convert_block_hash(h)
        for h in req.block_hashes[num_cached_blocks:num_full_blocks]
    ]
    assert event.block_hashes == expected_new_hashes
    # Ensure we didn't accidentally assign a hash to the null block.
    assert pool.null_block.block_hash is None
    # Sanity check: newly cached physical blocks should have hashes assigned.
    assert blocks[num_cached_blocks].block_hash is not None
    assert blocks[num_full_blocks - 1].block_hash is not None
@pytest.mark.parametrize("blocks_to_cache", [2, 3, 10])
 def test_kv_cache_events_with_lora(blocks_to_cache: int):
    """Test BlockStored events contain correct lora_id when using LoRA requests."""
--- a/tests/v1/ec_connector/integration/test_epd_correctness.py
+++ b/tests/v1/ec_connector/integration/test_epd_correctness.py
@ -31,7 +31,7 @@ import openai
 import requests
 from vllm.assets.image import ImageAsset
-from vllm.multimodal.utils import encode_image_base64
+from vllm.multimodal.utils import encode_image_url
 MAX_OUTPUT_LEN = 256
@ -49,9 +49,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
                "content": [
                    {
                        "type": "image_url",
-                        "image_url": {
+                        "image_url": {"url": encode_image_url(image_1)},
                            "url": f"data:image;base64,{encode_image_base64(image_1)}"
                        },
                    },
                    {"type": "text", "text": "What's in this image?"},
                ],
@ -66,9 +64,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
                "content": [
                    {
                        "type": "image_url",
-                        "image_url": {
+                        "image_url": {"url": encode_image_url(image_2)},
                            "url": f"data:image;base64,{encode_image_base64(image_2)}"
                        },
                    },
                    {
                        "type": "image_url",
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@ -260,7 +260,7 @@ async def test_multi_abort(output_kind: RequestOutputKind):
        # Use multi-abort to abort multiple requests at once
        abort_request_ids = [request_ids[i] for i in REQUEST_IDS_TO_ABORT]
-        await engine.abort(abort_request_ids)
+        await engine.abort(abort_request_ids, internal=False)
        # Wait for all tasks to complete
        results = await asyncio.gather(*tasks, return_exceptions=True)
@ -609,7 +609,7 @@ async def test_abort_final_output(output_kind: RequestOutputKind):
        await asyncio.sleep(0.5)
        # Abort the request
-        await engine.abort(request_id)
+        await engine.abort(request_id, internal=False)
        # Wait for generation to complete and return final output
        final_output = await generated
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@ -40,10 +40,16 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
 PROMPT = "I am Gyoubu Masataka Oniwa"
 PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
 _REQUEST_COUNTER = 0
 def make_request() -> EngineCoreRequest:
    global _REQUEST_COUNTER
    _REQUEST_COUNTER += 1
    request_id = f"request-{_REQUEST_COUNTER}"
    return EngineCoreRequest(
-        request_id=str(uuid.uuid4()),
+        request_id=request_id,
        external_req_id=f"{request_id}-{uuid.uuid4()}",
        prompt_token_ids=PROMPT_TOKENS,
        mm_features=None,
        sampling_params=SamplingParams(),
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@ -45,6 +45,8 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
 PROMPT = "Hello my name is Robert and I love quantization kernels"
 PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids
 _REQUEST_COUNTER = 0
 def make_request(
    params: SamplingParams, prompt_tokens_ids: list[int] | None = None
@ -52,8 +54,12 @@ def make_request(
    if not prompt_tokens_ids:
        prompt_tokens_ids = PROMPT_TOKENS
    global _REQUEST_COUNTER
    _REQUEST_COUNTER += 1
    request_id = f"request-{_REQUEST_COUNTER}"
    return EngineCoreRequest(
-        request_id=str(uuid.uuid4()),
+        request_id=request_id,
        external_req_id=f"{request_id}-{uuid.uuid4()}",
        prompt_token_ids=prompt_tokens_ids,
        mm_features=None,
        sampling_params=params,
--- a/tests/v1/engine/test_fast_incdec_prefix_err.py
+++ b/tests/v1/engine/test_fast_incdec_prefix_err.py
@ -27,6 +27,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
    params = SamplingParams(skip_special_tokens=True)
    request = EngineCoreRequest(
        request_id="test",
        external_req_id="test-ext",
        prompt_token_ids=prompt_token_ids,
        mm_features=None,
        sampling_params=params,
--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@ -58,12 +58,12 @@ def test_incremental_detokenization(
    output_processor = OutputProcessor(
        dummy_test_vectors.tokenizer, log_stats=False, stream_interval=stream_interval
    )
    engine_core = MockEngineCore(tokens_list=dummy_test_vectors.generation_tokens)
    # Make N requests.
    requests = [
        EngineCoreRequest(
-            request_id=f"request-{idx}",
+            request_id=f"request-{idx}-int",
            external_req_id=f"request-{idx}",
            prompt_token_ids=prompt_tokens,
            mm_features=None,
            eos_token_id=None,
@ -83,6 +83,11 @@ def test_incremental_detokenization(
        for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
    ]
    engine_core = MockEngineCore(
        tokens_list=dummy_test_vectors.generation_tokens,
        request_ids=[req.request_id for req in requests],
    )
    # Add requests to the detokenizer.
    for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
        output_processor.add_request(request, prompt)
@ -438,15 +443,6 @@ def test_logprobs_processor(
    dummy_test_vectors,
 ):
    output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
    engine_core = MockEngineCore(
        tokens_list=dummy_test_vectors.generation_tokens,
        generated_logprobs_raw=None
        if num_sample_logprobs is None
        else dummy_test_vectors.generation_logprobs,
        prompt_logprobs_raw=None
        if num_prompt_logprobs is None
        else dummy_test_vectors.prompt_logprobs,
    )
    # Make N requests.
    request_id_list = [
@ -454,7 +450,8 @@ def test_logprobs_processor(
    ]
    requests = [
        EngineCoreRequest(
-            request_id=request_id_list[idx],
+            request_id=request_id_list[idx] + "-int",
            external_req_id=request_id_list[idx],
            prompt_token_ids=prompt_tokens,
            mm_features=None,
            eos_token_id=None,
@ -476,6 +473,17 @@ def test_logprobs_processor(
        for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
    ]
    engine_core = MockEngineCore(
        tokens_list=dummy_test_vectors.generation_tokens,
        generated_logprobs_raw=None
        if num_sample_logprobs is None
        else dummy_test_vectors.generation_logprobs,
        prompt_logprobs_raw=None
        if num_prompt_logprobs is None
        else dummy_test_vectors.prompt_logprobs,
        request_ids=[req.request_id for req in requests],
    )
    # Add requests to the detokenizer.
    for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
        output_processor.add_request(request, prompt)
@ -621,19 +629,12 @@ def test_stop_token(
        ]
    prompt_string = dummy_test_vectors.prompt_strings[0]
    prompt_tokens = dummy_test_vectors.prompt_tokens[0]
    engine_core = MockEngineCore(
        tokens_list=[generation_tokens],
        generated_logprobs_raw=[generation_logprobs] if do_logprobs else None,
        prompt_logprobs_raw=None,
        eos_token_id=eos_token_id,
        stop_token_ids=stop_token_ids,
        ignore_eos=ignore_eos,
    )
    # Make request.
    request_id = "request-0"
    request = EngineCoreRequest(
        request_id=request_id,
        external_req_id=request_id + "-ext",
        prompt_token_ids=prompt_tokens,
        mm_features=None,
        eos_token_id=eos_token_id,
@ -655,6 +656,16 @@ def test_stop_token(
        pooling_params=None,
    )
    engine_core = MockEngineCore(
        tokens_list=[generation_tokens],
        generated_logprobs_raw=[generation_logprobs] if do_logprobs else None,
        prompt_logprobs_raw=None,
        eos_token_id=eos_token_id,
        stop_token_ids=stop_token_ids,
        ignore_eos=ignore_eos,
        request_ids=[request.request_id],
    )
    # Add request to the detokenizer.
    output_processor.add_request(request, prompt_string)
@ -720,13 +731,6 @@ def test_stop_string(
    dummy_test_vectors,
 ):
    output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
    engine_core = MockEngineCore(
        tokens_list=dummy_test_vectors.generation_tokens,
        generated_logprobs_raw=dummy_test_vectors.generation_logprobs
        if num_sample_logprobs
        else None,
        prompt_logprobs_raw=None,
    )
    # Make N requests.
    request_id_list = [
@ -734,7 +738,8 @@ def test_stop_string(
    ]
    requests = [
        EngineCoreRequest(
-            request_id=request_id_list[idx],
+            request_id=request_id_list[idx] + "-int",
            external_req_id=request_id_list[idx],
            prompt_token_ids=prompt_tokens,
            mm_features=None,
            eos_token_id=None,
@ -756,6 +761,15 @@ def test_stop_string(
        for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
    ]
    engine_core = MockEngineCore(
        tokens_list=dummy_test_vectors.generation_tokens,
        generated_logprobs_raw=dummy_test_vectors.generation_logprobs
        if num_sample_logprobs
        else None,
        prompt_logprobs_raw=None,
        request_ids=[req.request_id for req in requests],
    )
    # Add requests to the detokenizer.
    for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
        output_processor.add_request(request, prompt)
@ -813,9 +827,12 @@ def test_stop_string(
    for idx, (ref_gen_str, stop_str) in enumerate(
        zip(dummy_test_vectors.generation_strings, STOP_STRINGS)
    ):
-        # Request should be aborted.
+        # Request should be aborted (check internal ID in abort list).
        internal_request_id = f"request-{idx}-int"
        assert internal_request_id in aborted
        # Use external ID for collecting outputs
        request_id = f"request-{idx}"
        assert request_id in aborted
        # Collected values that were generated.
        gen_str = gen_strings[request_id]
@ -848,13 +865,13 @@ def test_stop_string(
 def test_iteration_stats(dummy_test_vectors):
    output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
    engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
    engine_core_timestamp = time.monotonic()
    # Make N requests.
    requests = [
        EngineCoreRequest(
            request_id=f"request-{idx}",
            external_req_id=f"request-{idx}-ext",
            prompt_token_ids=prompt_tokens,
            mm_features=None,
            eos_token_id=None,
@ -868,6 +885,11 @@ def test_iteration_stats(dummy_test_vectors):
        for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
    ]
    engine_core = MockEngineCore(
        dummy_test_vectors.generation_tokens,
        request_ids=[req.request_id for req in requests],
    )
    # Add all requests except one to the OutputProcessor.
    num_active = len(dummy_test_vectors.generation_tokens) - 1
    for request in requests[:num_active]:
@ -922,7 +944,6 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
    output_processor = OutputProcessor(
        dummy_test_vectors.tokenizer, log_stats=log_stats
    )
    engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
    engine_core_timestamp = time.monotonic()
    # Create LoRA requests
@ -936,7 +957,8 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
    lora_assignments = [lora1, lora2, None]
    requests = [
        EngineCoreRequest(
-            request_id=f"request-{idx}",
+            request_id=f"request-{idx}-int",
            external_req_id=f"request-{idx}",
            prompt_token_ids=prompt_tokens,
            mm_features=None,
            eos_token_id=None,
@ -950,6 +972,11 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
        for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
    ]
    engine_core = MockEngineCore(
        dummy_test_vectors.generation_tokens,
        request_ids=[req.request_id for req in requests],
    )
    # Add all requests to the OutputProcessor
    for request in requests:
        output_processor.add_request(request, None)
@ -1015,9 +1042,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
    outputs = EngineCoreOutputs(
        outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
    )
-    # Find and mark request-0 as finished (it uses lora-1)
+    # Find and mark request-0-int as finished (it uses lora-1)
    for output in outputs.outputs:
-        if output.request_id == "request-0":
+        if output.request_id == "request-0-int":
            output.finish_reason = FinishReason.LENGTH
            break
@ -1040,9 +1067,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
    outputs = EngineCoreOutputs(
        outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
    )
-    # Find and mark request-1 as finished (it uses lora-2)
+    # Find and mark request-1-int as finished (it uses lora-2)
    for output in outputs.outputs:
-        if output.request_id == "request-1":
+        if output.request_id == "request-1-int":
            output.finish_reason = FinishReason.LENGTH
            break
@ -1064,9 +1091,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
    outputs = EngineCoreOutputs(
        outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
    )
-    # Find and mark request-2 as finished (it has no LoRA)
+    # Find and mark request-2-int as finished (it has no LoRA)
    for output in outputs.outputs:
-        if output.request_id == "request-2":
+        if output.request_id == "request-2-int":
            output.finish_reason = FinishReason.LENGTH
            break
@ -1107,7 +1134,9 @@ async def test_request_output_collector():
            for idx in range(NUM_REQS)
        ]
-    collector = RequestOutputCollector(RequestOutputKind.DELTA)
+    collector = RequestOutputCollector(
        RequestOutputKind.DELTA, request_id="my-request-id-int"
    )
    # CASE 1: Put then get.
    outputs = make_outputs()
@ -1163,7 +1192,9 @@ async def test_request_output_collector():
@pytest.mark.asyncio
 async def test_cumulative_output_collector_n():
    """Test collector correctly handles multiple outputs by index."""
-    collector = RequestOutputCollector(RequestOutputKind.CUMULATIVE)
+    collector = RequestOutputCollector(
        RequestOutputKind.CUMULATIVE, request_id="my-request-id-int"
    )
    outputs = [
        RequestOutput(
            request_id="my-request-id",
@ -1242,11 +1273,13 @@ async def test_cumulative_output_collector_n():
@pytest.mark.parametrize("runner", ["generate", "pooling"])
-def test_abort_requests(runner: str, dummy_test_vectors):
+@pytest.mark.parametrize("abort_by", ["internal", "external"])
 def test_abort_requests(runner: str, abort_by: str, dummy_test_vectors):
    output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
    requests = [
        EngineCoreRequest(
            request_id=f"request-{idx}",
            external_req_id=f"external-{idx}",
            prompt_token_ids=prompt_tokens,
            mm_features=None,
            eos_token_id=None,
@ -1265,8 +1298,13 @@ def test_abort_requests(runner: str, dummy_test_vectors):
            output_kind = request.sampling_params.output_kind
        else:
            output_kind = request.pooling_params.output_kind
-        queue = RequestOutputCollector(output_kind=output_kind)
+        queue = RequestOutputCollector(
            output_kind=output_kind, request_id=request.request_id
        )
        output_processor.add_request(request, None, queue=queue)
    for request in requests:
-        output_processor.abort_requests([request.request_id])
+        if abort_by == "internal":
            output_processor.abort_requests([request.request_id], internal=True)
        else:
            output_processor.abort_requests([request.external_req_id], internal=False)
--- a/tests/v1/engine/test_parallel_sampling.py
+++ b/tests/v1/engine/test_parallel_sampling.py
@ -4,11 +4,12 @@
 from vllm import SamplingParams
 from vllm.outputs import CompletionOutput
 from vllm.sampling_params import RequestOutputKind
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.parallel_sampling import ParentRequest
 def test_parent_request_to_output_stream() -> None:
-    parent_request = ParentRequest("parent_id", SamplingParams(n=2))
+    parent_request = ParentRequest(make_request(SamplingParams(n=2)))
    parent_request.child_requests = {"child_id_0", "child_id_1"}
    output_0 = CompletionOutput(
        index=0, text="child 0", token_ids=[], cumulative_logprob=None, logprobs=None
@ -17,51 +18,31 @@ def test_parent_request_to_output_stream() -> None:
        index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
    )
    # Request not finished
-    assert ("parent_id", [output_0], False) == parent_request.get_outputs(
+    assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
-        "child_id_0", output_0
+    assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
-    )
+    assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
-    assert ("parent_id", [output_1], False) == parent_request.get_outputs(
+    assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
        "child_id_1", output_1
    )
    assert ("parent_id", [output_0], False) == parent_request.get_outputs(
        "child_id_0", output_0
    )
    assert ("parent_id", [output_1], False) == parent_request.get_outputs(
        "child_id_1", output_1
    )
    # output_1 finished
    output_1.finish_reason = "ended"
-    assert ("parent_id", [output_0], False) == parent_request.get_outputs(
+    assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
-        "child_id_0", output_0
+    assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
    )
    assert ("parent_id", [output_1], False) == parent_request.get_outputs(
        "child_id_1", output_1
    )
    # Finished output_1 has already been returned; do NOT return it again
-    assert ("parent_id", [output_0], False) == parent_request.get_outputs(
+    assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
-        "child_id_0", output_0
+    assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
    )
    assert parent_request.get_outputs("child_id_1", output_1) == (
        "parent_id",
        [],
        False,
    )
    # output_0 finished
    output_0.finish_reason = "ended"
-    assert ("parent_id", [output_0], True) == parent_request.get_outputs(
+    assert ([output_0], True) == parent_request.get_outputs("child_id_0", output_0)
-        "child_id_0", output_0
+    assert parent_request.get_outputs("child_id_1", output_1) == ([], True)
    )
    assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True)
    # Finished output_0 has already been returned; do NOT return it again
-    assert parent_request.get_outputs("child_id_0", output_0) == ("parent_id", [], True)
+    assert parent_request.get_outputs("child_id_0", output_0) == ([], True)
-    assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True)
+    assert parent_request.get_outputs("child_id_1", output_1) == ([], True)
 def test_parent_request_to_output_final_only() -> None:
    parent_request = ParentRequest(
-        "parent_id", SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY)
+        make_request(SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY))
    )
    parent_request.child_requests = {"child_id_0", "child_id_1"}
    output_0 = CompletionOutput(
@ -71,33 +52,33 @@ def test_parent_request_to_output_final_only() -> None:
        index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
    )
    # Request not finished, return nothing
-    assert parent_request.get_outputs("child_id_0", output_0) == (
+    assert parent_request.get_outputs("child_id_0", output_0) == ([], False)
-        "parent_id",
+    assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
        [],
        False,
    )
    assert parent_request.get_outputs("child_id_1", output_1) == (
        "parent_id",
        [],
        False,
    )
    # output_1 finished, but outputs won't be returned until all child requests finished
    output_1.finish_reason = "ended"
-    assert parent_request.get_outputs("child_id_0", output_0) == (
+    assert parent_request.get_outputs("child_id_0", output_0) == ([], False)
-        "parent_id",
+    assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
        [],
        False,
    )
    assert parent_request.get_outputs("child_id_1", output_1) == (
        "parent_id",
        [],
        False,
    )
    # output_0 finished, as all child requests finished, the output would be returned
    output_0.finish_reason = "ended"
-    assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs(
+    assert ([output_0, output_1], True) == parent_request.get_outputs(
        "child_id_0", output_0
    )
-    assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs(
+    assert ([output_0, output_1], True) == parent_request.get_outputs(
        "child_id_1", output_1
    )
 def make_request(sampling_params: SamplingParams) -> EngineCoreRequest:
    """Build a minimal EngineCoreRequest carrying ``sampling_params``.

    The request always uses the internal ID "parent_id" and the external ID
    "ext_parent_id", an arrival time of 0.0, and ``None`` for every other
    field set here.
    """
    # Fields this fixture deliberately leaves empty.
    unset_fields = dict.fromkeys(
        (
            "prompt_token_ids",
            "mm_features",
            "pooling_params",
            "eos_token_id",
            "lora_request",
            "cache_salt",
            "data_parallel_rank",
        )
    )
    request = EngineCoreRequest(
        request_id="parent_id",
        external_req_id="ext_parent_id",
        sampling_params=sampling_params,
        arrival_time=0.0,
        **unset_fields,
    )
    return request
--- a/tests/v1/engine/test_process_multi_modal_uuids.py
+++ b/tests/v1/engine/test_process_multi_modal_uuids.py
@ -6,6 +6,7 @@ import pytest
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
 from vllm.multimodal import MultiModalUUIDDict
 from vllm.sampling_params import SamplingParams
 from vllm.v1.engine import input_processor as input_processor_mod
 from vllm.v1.engine.input_processor import InputProcessor
@ -166,7 +167,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
        monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False
    )
-    captured: dict[str, object] = {}
+    captured: dict[str, MultiModalUUIDDict] = {}
    def fake_preprocess(
        prompt, *, tokenization_kwargs=None, lora_request=None, mm_uuids=None
@ -196,7 +197,16 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
    )
    # Expect request-id-based overrides are passed through
-    assert captured["mm_uuids"] == {
+    mm_uuids = captured["mm_uuids"]
-        "image": [f"{request_id}-image-0", f"{request_id}-image-1"],
+    assert set(mm_uuids.keys()) == {"image", "video"}
-        "video": [f"{request_id}-video-0"],
+    assert len(mm_uuids["image"]) == 2
-    }
+    assert len(mm_uuids["video"]) == 1
    assert mm_uuids["image"][0].startswith(f"{request_id}-image-") and mm_uuids[
        "image"
    ][0].endswith("-0")
    assert mm_uuids["image"][1].startswith(f"{request_id}-image-") and mm_uuids[
        "image"
    ][1].endswith("-1")
    assert mm_uuids["video"][0].startswith(f"{request_id}-video-") and mm_uuids[
        "video"
    ][0].endswith("-0")
--- a/tests/v1/engine/utils.py
+++ b/tests/v1/engine/utils.py
@ -343,6 +343,7 @@ class MockEngineCore:
        eos_token_id: int | None = None,
        stop_token_ids: list[int] | None = None,
        ignore_eos: bool = False,
        request_ids: list[str] | None = None,
    ) -> None:
        self.num_requests = len(tokens_list)
        self.tokens_list = tokens_list
@ -355,6 +356,11 @@ class MockEngineCore:
        self.eos_token_id = eos_token_id
        self.stop_token_ids = stop_token_ids
        self.ignore_eos = ignore_eos
        self.request_ids = (
            request_ids
            if request_ids is not None
            else [f"request-{i}" for i in range(self.num_requests)]
        )
    def get_outputs(self) -> list[EngineCoreOutput]:
        do_logprobs = self.do_logprobs
@ -386,7 +392,7 @@ class MockEngineCore:
                    prompt_logprobs = None
                new_token_id = token_ids[token_idx]
                output = EngineCoreOutput(
-                    request_id=f"request-{req_idx}",
+                    request_id=self.request_ids[req_idx],
                    new_token_ids=[new_token_id],
                    new_logprobs=logprobs,
                    new_prompt_logprobs_tensors=prompt_logprobs,
--- a/tests/v1/entrypoints/openai/serving_responses/test_image.py
+++ b/tests/v1/entrypoints/openai/serving_responses/test_image.py
@ -8,7 +8,7 @@ import pytest
 import pytest_asyncio
 from tests.utils import RemoteOpenAIServer
-from vllm.multimodal.utils import encode_image_base64
+from vllm.multimodal.utils import encode_image_url
 # Use a small vision model for testing
 MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
@ -52,9 +52,9 @@ async def client(image_server):
@pytest.fixture(scope="session")
-def base64_encoded_image(local_asset_server) -> dict[str, str]:
+def url_encoded_image(local_asset_server) -> dict[str, str]:
    return {
-        image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
+        image_url: encode_image_url(local_asset_server.get_image_asset(image_url))
        for image_url in TEST_IMAGE_ASSETS
    }
@ -95,7 +95,7 @@ async def test_single_chat_session_image_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    raw_image_url: str,
-    base64_encoded_image: dict[str, str],
+    url_encoded_image: dict[str, str],
 ):
    content_text = "What's in this image?"
    messages = [
@ -104,7 +104,7 @@ async def test_single_chat_session_image_base64encoded(
            "content": [
                {
                    "type": "input_image",
-                    "image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",  # noqa: E501
+                    "image_url": url_encoded_image[raw_image_url],
                    "detail": "auto",
                },
                {"type": "input_text", "text": content_text},
--- a/tests/v1/kv_connector/unit/test_example_connector.py
+++ b/tests/v1/kv_connector/unit/test_example_connector.py
@ -9,7 +9,7 @@ from PIL import Image
 from vllm import LLM, EngineArgs, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.config import KVTransferConfig
-from vllm.multimodal.utils import encode_image_base64
+from vllm.multimodal.utils import encode_image_url
 from vllm.platforms import current_platform
 MODEL_NAME = "RedHatAI/Qwen2.5-VL-3B-Instruct-quantized.w8a8"
@ -74,7 +74,7 @@ def process_prompt(processor, llm: LLM, question: str, image_urls: list[Image]):
    placeholders = [
        {
            "type": "image_url",
-            "image_url": {"url": f"data:image;base64,{encode_image_base64(image_pil)}"},
+            "image_url": {"url": encode_image_url(image_pil)},
        }
        for image_pil in image_urls
    ]
--- a/tests/v1/kv_connector/unit/test_nixl_connector.py
+++ b/tests/v1/kv_connector/unit/test_nixl_connector.py
@ -41,10 +41,13 @@ from vllm.distributed.kv_transfer.kv_transfer_state import (
    has_kv_transfer_group,
 )
 from vllm.forward_context import ForwardContext
 from vllm.outputs import RequestOutput
 from vllm.platforms import current_platform
 from vllm.platforms.interface import Platform
 from vllm.sampling_params import SamplingParams
 from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.output_processor import OutputProcessor
 from vllm.v1.outputs import KVConnectorOutput, ModelRunnerOutput
 from vllm.v1.request import RequestStatus
@ -1265,6 +1268,22 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend):
        run_test_and_cleanup()
 class RequestIdMapper:
    """Records the internal request ID assigned to each external request ID.

    On construction the instance replaces ``output_processor.add_request``
    with a wrapper that notes ``external_req_id -> request_id`` for every
    request before delegating to the original method. Calling the instance
    with an external ID returns the recorded internal ID.
    """

    def __init__(self, output_processor: OutputProcessor):
        # Keep the real method so the wrapper can still delegate to it.
        self.original_add_request = output_processor.add_request
        self.req_id_mapping: dict[str, str] = {}
        # Route all later add_request calls through the recording wrapper.
        output_processor.add_request = self._add_request

    def _add_request(self, request: EngineCoreRequest, *args, **kwargs):
        """Remember the request's ID pair, then forward to the original."""
        external_id, internal_id = request.external_req_id, request.request_id
        self.req_id_mapping[external_id] = internal_id
        return self.original_add_request(request, *args, **kwargs)

    def __call__(self, external_req_id: str) -> str:
        """Return the internal ID recorded for ``external_req_id``.

        Raises KeyError if no request with that external ID has been added.
        """
        internal_id = self.req_id_mapping[external_req_id]
        return internal_id
 def _run_abort_timeout_test(llm: LLM, timeout: int):
    """Helper function to run the abort timeout test logic."""
    remote_prefill_opts = {
@ -1286,24 +1305,34 @@ def _run_abort_timeout_test(llm: LLM, timeout: int):
        0
    ].req_to_blocks
    id_mapper = RequestIdMapper(llm.llm_engine.output_processor)
    def req_id(outputs: list[RequestOutput]) -> str:
        assert len(outputs) == 1
        return id_mapper(outputs[0].request_id)
    padding = "Just making this request a little longer so that we're sure "
    "we're not hitting the small-request lower bound beneath which we don't "
    "actually trigger the whole kv transfer, but rather just recompute the "
    "blocks on D."
-    _ = llm.generate([f"What is the capital of Japan? {padding}"], sampling_params)
+    req0_id = req_id(
        llm.generate([f"What is the capital of Japan? {padding}"], sampling_params)
    )
    # Request finished but not freed
-    assert "0" in scheduler.finished_req_ids and "0" in req_to_blocks
+    assert req0_id in scheduler.finished_req_ids and req0_id in req_to_blocks
    # Some other request, 0 still not freed
-    _ = llm.generate([f"What is the capital of Italy? {padding}"], sampling_params)
+    req1_id = req_id(
-    assert "0" in req_to_blocks
+        llm.generate([f"What is the capital of Italy? {padding}"], sampling_params)
-    assert "1" in scheduler.finished_req_ids and "1" in req_to_blocks
+    )
    assert req0_id in req_to_blocks
    assert req1_id in scheduler.finished_req_ids and req1_id in req_to_blocks
    # Wait for timeout and trigger another scheduler loop
    time.sleep(timeout)
    _ = llm.generate([f"What is the capital of France? {padding}"], sampling_params)
    # Request-0 times out and is cleared!
-    assert "0" not in req_to_blocks
+    assert req0_id not in req_to_blocks
    # Need to shutdown the background thread to release NIXL side channel port
    llm.llm_engine.engine_core.shutdown()
--- a/tests/v1/tpu/test_multimodal.py
+++ b/tests/v1/tpu/test_multimodal.py
@ -4,7 +4,7 @@
 import openai
 import pytest
-from vllm.multimodal.utils import encode_image_base64
+from vllm.multimodal.utils import encode_image_url
 from vllm.platforms import current_platform
 from ...entrypoints.openai.test_vision import TEST_IMAGE_ASSETS
@ -12,11 +12,9 @@ from ...utils import RemoteOpenAIServer
@pytest.fixture(scope="session")
-def base64_encoded_image(local_asset_server) -> dict[str, str]:
+def url_encoded_image(local_asset_server) -> dict[str, str]:
    return {
-        image_asset: encode_image_base64(
+        image_asset: encode_image_url(local_asset_server.get_image_asset(image_asset))
            local_asset_server.get_image_asset(image_asset)
        )
        for image_asset in TEST_IMAGE_ASSETS
    }
@ -24,19 +22,16 @@ def base64_encoded_image(local_asset_server) -> dict[str, str]:
@pytest.mark.asyncio
@pytest.mark.skipif(not current_platform.is_tpu(), reason="This test needs a TPU")
@pytest.mark.parametrize("model_name", ["llava-hf/llava-1.5-7b-hf"])
-async def test_basic_vision(model_name: str, base64_encoded_image: dict[str, str]):
+async def test_basic_vision(model_name: str, url_encoded_image: dict[str, str]):
    pytest.skip("Skip this test until it's fixed.")
-    def whats_in_this_image_msg(b64):
+    def whats_in_this_image_msg(url):
        return [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's in this image?"},
-                    {
+                    {"type": "image_url", "image_url": {"url": url}},
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{b64}"},
                    },
                ],
            }
        ]
@ -63,14 +58,14 @@ async def test_basic_vision(model_name: str, base64_encoded_image: dict[str, str
        # Other requests now should be much faster
        for image_url in TEST_IMAGE_ASSETS:
-            image_base64 = base64_encoded_image[image_url]
+            image_url = url_encoded_image[image_url]
-            chat_completion_from_base64 = await client.chat.completions.create(
+            chat_completion_from_url = await client.chat.completions.create(
                model=model_name,
-                messages=whats_in_this_image_msg(image_base64),
+                messages=whats_in_this_image_msg(image_url),
                max_completion_tokens=24,
                temperature=0.0,
            )
-            result = chat_completion_from_base64
+            result = chat_completion_from_url
            assert result
            choice = result.choices[0]
            assert choice.finish_reason == "length"
--- a/vllm/_aiter_ops.py
+++ b/vllm/_aiter_ops.py
@ -380,6 +380,31 @@ def _rocm_aiter_gemm_a8w8_fake(
    return Y
 def _rocm_aiter_triton_gemm_a8w8_blockscale_impl(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    output_dtype: torch.dtype = torch.float16,
 ) -> torch.Tensor:
    """Call AITER's Triton ``gemm_a8w8_blockscale`` and return its result.

    The four tensors are forwarded positionally and ``output_dtype`` is
    passed as the kernel's ``dtype`` keyword.
    NOTE(review): presumably ``As``/``Bs`` are the block scales for
    ``A``/``B`` -- confirm against the aiter kernel's documentation.
    """
    # The import is local to the function, so aiter is only loaded when this
    # implementation actually runs.
    from aiter.ops.triton.gemm_a8w8_blockscale import (
        gemm_a8w8_blockscale as triton_blockscale_gemm,
    )

    result = triton_blockscale_gemm(A, B, As, Bs, dtype=output_dtype)
    return result
 def _rocm_aiter_triton_gemm_a8w8_blockscale_fake(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    output_dtype: torch.dtype = torch.float16,
 ) -> torch.Tensor:
    m = A.shape[0]
    n = B.shape[0]
    Y = torch.empty(m, n, dtype=output_dtype, device=A.device)
    return Y
 def _rocm_aiter_gemm_a8w8_blockscale_impl(
    A: torch.Tensor,
    B: torch.Tensor,
@ -964,6 +989,12 @@ class rocm_aiter_ops:
                dispatch_key=current_platform.dispatch_key,
            )
            direct_register_custom_op(
                op_name="rocm_aiter_triton_gemm_a8w8_blockscale",
                op_func=_rocm_aiter_triton_gemm_a8w8_blockscale_impl,
                fake_impl=_rocm_aiter_triton_gemm_a8w8_blockscale_fake,
            )
            direct_register_custom_op(
                op_name="rocm_aiter_gemm_a8w8_blockscale",
                op_func=_rocm_aiter_gemm_a8w8_blockscale_impl,
@ -1102,6 +1133,19 @@ class rocm_aiter_ops:
    ) -> torch.Tensor:
        return torch.ops.vllm.rocm_aiter_gemm_a8w8(A, B, As, Bs, bias, output_dtype)
    @staticmethod
    def triton_gemm_a8w8_blockscale(
        A: torch.Tensor,
        B: torch.Tensor,
        As: torch.Tensor,
        Bs: torch.Tensor,
        block_size: list[int],
        output_dtype: torch.dtype = torch.float16,
    ) -> torch.Tensor:
        """Dispatch to the ``rocm_aiter_triton_gemm_a8w8_blockscale`` custom op.

        ``block_size`` is accepted but is not forwarded to the op; only the
        four tensors and ``output_dtype`` are passed through, positionally.
        """
        blockscale_op = torch.ops.vllm.rocm_aiter_triton_gemm_a8w8_blockscale
        return blockscale_op(A, B, As, Bs, output_dtype)
    @staticmethod
    def gemm_a8w8_blockscale(
        A: torch.Tensor,
@ -1373,19 +1417,6 @@ class rocm_aiter_ops:
            config=config,
        )
    @staticmethod
    def triton_gemm_a8w8_blockscale(
        A: torch.Tensor,
        B: torch.Tensor,
        As: torch.Tensor,
        Bs: torch.Tensor,
        block_size: list[int],
        output_dtype: torch.dtype = torch.float16,
    ) -> torch.Tensor:
        from aiter.ops.triton.gemm_a8w8_blockscale import gemm_a8w8_blockscale
        return gemm_a8w8_blockscale(A, B, As, Bs, dtype=output_dtype)
    @staticmethod
    def group_fp8_quant(
        input_2d: torch.Tensor,
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@ -2328,18 +2328,6 @@ def concat_and_cache_mla(
    )
 def copy_blocks(
    key_caches: list[torch.Tensor],
    value_caches: list[torch.Tensor],
    block_mapping: torch.Tensor,
 ) -> None:
    torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping)
 def copy_blocks_mla(kv_caches: list[torch.Tensor], block_mapping: torch.Tensor) -> None:
    torch.ops._C_cache_ops.copy_blocks_mla(kv_caches, block_mapping)
 def swap_blocks(
    src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor
 ) -> None:
--- a/vllm/_ipex_ops.py
+++ b/vllm/_ipex_ops.py
@ -383,18 +383,6 @@ class ipex_ops:
        )
        return None
    @staticmethod
    def copy_blocks(
        key_caches: list[torch.Tensor],
        value_caches: list[torch.Tensor],
        block_mapping: torch.Tensor,
    ) -> None:
        torch.xpu.copy_blocks(  # type: ignore
            key_caches,
            value_caches,
            block_mapping,
        )
    @staticmethod
    def swap_blocks(
        src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor
--- a/vllm/attention/layers/mm_encoder_attention.py
+++ b/vllm/attention/layers/mm_encoder_attention.py
@ -136,7 +136,7 @@ class MMEncoderAttention(CustomOp):
            cu_seqlens=cu_seqlens,
        )
        if is_reshaped:
-            output = output.view(bsz, q_len, -1)
+            output = output.reshape(bsz, q_len, -1)
        return output
    def _forward_fa(
@ -174,7 +174,7 @@ class MMEncoderAttention(CustomOp):
            fa_version=self._fa_version,
        )
        if is_reshaped:
-            output = output.view(bsz, q_len, -1)
+            output = output.reshape(bsz, q_len, -1)
        return output
    def forward_native(
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@ -1847,7 +1847,6 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]:
                random_seed=args.seed,
                dataset_path=args.dataset_path,
                disable_shuffle=args.disable_shuffle,
                prefix_len=args.common_prefix_len,
            ).sample(
                tokenizer=tokenizer,
                num_requests=args.num_prompts,
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@ -1281,12 +1281,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
        help="Repetition penalty sampling parameter. Only has effect on "
        "openai-compatible backends.",
    )
    sampling_group.add_argument(
        "--common-prefix-len",
        type=int,
        default=None,
        help="Common prefix length shared by all prompts (used by random dataset)",
    )
    parser.add_argument(
        "--served-model-name",
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@ -172,7 +172,10 @@ class ModelConfig:
    format. Examples:\n
    - 1k -> 1000\n
    - 1K -> 1024\n
-    - 25.6k -> 25,600"""
+    - 25.6k -> 25,600\n
    - -1 or 'auto' -> Automatically choose the maximum model length that fits in
    GPU memory. This will use the model's maximum context length if it fits,
    otherwise it will find the largest length that can be accommodated."""
    spec_target_max_model_len: int | None = None
    """Specify the maximum length for spec decoding draft models."""
    quantization: QuantizationMethods | str | None = None
@ -1094,11 +1097,10 @@ class ModelConfig:
        # The size of inputs_embeds is usually identical to the size
        # of the hidden states, however there are exceptions, such as
        # embedding models like CLIP and SigLIP
-        for target_attr in ("projection_dim", "projection_size"):
+        names = ("projection_dim", "projection_size")
-            if hasattr(self.hf_text_config, target_attr):
+        return getattr_iter(
-                return getattr(self.hf_text_config, target_attr)
+            self.hf_text_config, names, default_factory=self.get_hidden_size
-
+        )
        return self.get_hidden_size()
    @property
    def is_deepseek_mla(self) -> bool:
@ -1231,14 +1233,12 @@ class ModelConfig:
            # For ChatGLM:
            "multi_query_group_num",
        ]
        for attr in attributes:
            num_kv_heads = getattr(self.hf_text_config, attr, None)
            if num_kv_heads is not None:
                return num_kv_heads
        # For non-grouped-query attention models, the number of KV heads is
        # equal to the number of attention heads.
-        return self.hf_text_config.num_attention_heads
+        default_factory = lambda: self.hf_text_config.num_attention_heads
        return getattr_iter(
            self.hf_text_config, attributes, default_factory=default_factory
        )
    def get_num_kv_heads(self, parallel_config: ParallelConfig) -> int:
        """Returns the number of KV heads per GPU."""
@ -1542,6 +1542,10 @@ class ModelConfig:
    def is_multimodal_raw_input_only_model(self) -> bool:
        return self._model_info.supports_multimodal_raw_input_only
    @property
    def requires_raw_input_tokens(self) -> bool:
        """Return the ``requires_raw_input_tokens`` flag from ``_model_info``."""
        model_info = self._model_info
        return model_info.requires_raw_input_tokens
    @property
    def is_cross_encoder(self) -> bool:
        return (
@ -2150,9 +2154,10 @@ def _get_and_verify_max_len(
    if encoder_config and "max_seq_length" in encoder_config:
        derived_max_model_len = encoder_config["max_seq_length"]
-    # If the user didn't specify `max_model_len`, then use that derived from
+    # If the user didn't specify `max_model_len` or specified -1 (auto-fit),
-    # the model config as a default value.
+    # then use that derived from the model config as a default value.
-    if max_model_len is None:
+    # When -1 is specified, the engine will later auto-fit to available memory.
    if max_model_len is None or max_model_len == -1:
        # For LongRoPE, default to original_max_position_embeddings to avoid
        # performance degradation for shorter sequences
        if rope_parameters is not None and any(
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@ -465,6 +465,7 @@ class ParallelConfig:
            # Derived/runtime topology, networking, or launch details
            "data_parallel_rank",
            "data_parallel_rank_local",
            "data_parallel_size_local",
            "data_parallel_backend",
            "data_parallel_external_lb",
            "data_parallel_hybrid_lb",
--- a/vllm/config/utils.py
+++ b/vllm/config/utils.py
@ -9,7 +9,7 @@ import inspect
 import json
 import pathlib
 import textwrap
-from collections.abc import Iterable, Mapping, Sequence, Set
+from collections.abc import Callable, Iterable, Mapping, Sequence, Set
 from dataclasses import MISSING, Field, dataclass, field, fields, is_dataclass, replace
 from itertools import pairwise
 from typing import TYPE_CHECKING, Any, Protocol, TypeVar
@ -74,7 +74,11 @@ def get_field(cls: ConfigType, name: str) -> Field:
 def getattr_iter(
-    object: object, names: Iterable[str], default: Any, warn: bool = False
+    object: object,
    names: Iterable[str],
    default: Any | None = None,
    default_factory: Callable[[], Any] | None = None,
    warn: bool = False,
 ) -> Any:
    """
    A helper function that retrieves an attribute from an object which may
@ -96,7 +100,7 @@ def getattr_iter(
                    names[0],
                )
            return getattr(object, name)
-    return default
+    return default_factory() if default_factory is not None else default
 def contains_object_print(text: str) -> bool:
--- a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py
@ -408,7 +408,13 @@ class MooncakeConnectorWorker:
        self.engine = TransferEngine()
        self.hostname = get_ip()
-        ret_value = self.engine.initialize(self.hostname, "P2PHANDSHAKE", "rdma", "")
+        protocol = self.vllm_config.kv_transfer_config.kv_connector_extra_config.get(  # type: ignore[union-attr]
            "mooncake_protocol", "rdma"
        )
        logger.info(
            "The Mooncake Transfer Engine is using %s as its protocol.", protocol
        )
        ret_value = self.engine.initialize(self.hostname, "P2PHANDSHAKE", protocol, "")
        if ret_value != 0:
            raise RuntimeError("Mooncake Transfer Engine initialization failed.")
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@ -2045,13 +2045,20 @@ def _raise_unsupported_error(feature_name: str):
 def human_readable_int(value):
    """Parse human-readable integers like '1k', '2M', etc.
    Including decimal values with decimal multipliers.
    Also accepts -1 or 'auto' as a special value for auto-detection.
    Examples:
    - '1k' -> 1,000
    - '1K' -> 1,024
    - '25.6k' -> 25,600
    - '-1' or 'auto' -> -1 (special value for auto-detection)
    """
    value = value.strip()
    # Handle -1 or 'auto' as a special value for auto-detection
    if value == "-1" or value.lower() == "auto":
        return -1
    match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value)
    if match:
        decimal_multiplier = {
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@ -1621,7 +1621,7 @@ class LLM:
                added_request_ids.append(request_id)
        except Exception as e:
            if added_request_ids:
-                self.llm_engine.abort_request(added_request_ids)
+                self.llm_engine.abort_request(added_request_ids, internal=True)
            raise e
    def _validate_mm_data_and_uuids(
@ -1731,7 +1731,7 @@ class LLM:
            priority=priority,
            prompt_text=prompt_text,
        )
-        return request_id
+        return engine_request.request_id
    def _run_engine(
        self, *, use_tqdm: bool | Callable[..., tqdm] = True
--- a/vllm/entrypoints/pooling/embed/conftest.py
+++ b/vllm/entrypoints/pooling/embed/conftest.py
@ -0,0 +1,28 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Pytest configuration for vLLM pooling embed tests."""
 import warnings
 import torch
 from vllm.platforms import current_platform
 def pytest_collection_modifyitems(config, items):
    """Switch scaled-dot-product attention to the math kernel on ROCm.

    This is the pytest collection hook; ``config`` and ``items`` are accepted
    only to match the hook signature and are not inspected. On platforms
    other than ROCm the hook leaves the torch SDP settings untouched.
    """
    if current_platform.is_rocm():
        # Flash/MemEfficient SDP on ROCm lead to HF Transformers accuracy
        # issues: https://github.com/vllm-project/vllm/issues/30167
        # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
        sdp_switches = torch.backends.cuda
        sdp_switches.enable_flash_sdp(False)
        sdp_switches.enable_mem_efficient_sdp(False)
        sdp_switches.enable_math_sdp(True)
        # Make the global backend change visible in the test output.
        notice = (
            "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
            "to avoid HuggingFace Transformers accuracy issues"
        )
        warnings.warn(notice, UserWarning, stacklevel=1)
--- a/vllm/model_executor/layers/mamba/short_conv.py
+++ b/vllm/model_executor/layers/mamba/short_conv.py
@ -118,6 +118,7 @@ class ShortConv(MambaBase, CustomOp):
            conv_state = self_kv_cache[0].transpose(-1, -2)
            state_indices_tensor = attn_metadata.state_indices_tensor
            has_initial_states_p = attn_metadata.has_initial_states_p
            query_start_loc_p = attn_metadata.query_start_loc_p
        BCx, _ = self.in_proj(hidden_states)
@ -165,11 +166,6 @@ class ShortConv(MambaBase, CustomOp):
            [num_decodes, num_prefills],
            dim=0,
        )
        query_start_loc_p = (
            attn_metadata.query_start_loc[-num_prefills - 1 :] - num_decodes
            if has_prefill
            else None
        )
        conv_output_list = []
--- a/vllm/model_executor/models/audioflamingo3.py
+++ b/vllm/model_executor/models/audioflamingo3.py
@ -111,7 +111,7 @@ class AudioFlamingo3EmbeddingInputs(TensorSchema):
    audio_embeds: Annotated[
        list[torch.Tensor],
-        TensorShape("bn", "naf", "hs"),
+        TensorShape("bn", "naf", "hs", dynamic_dims={"naf"}),
    ]
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@ -878,11 +878,14 @@ class Indexer(nn.Module):
        )
        q_pe, k_pe = rotary_emb(positions, q_pe, k_pe.unsqueeze(1))
-        # `rotary_emb` is shape-preserving; `q_pe` is already
+        # Note: RoPE (NeoX) can introduce extra leading dimensions during compilation
-        # [num_tokens, n_head, rope_dim].
+        # so we need to reshape back to token-flattened shapes
        q_pe = q_pe.reshape(-1, self.n_head, self.rope_dim)
        k_pe = k_pe.reshape(-1, 1, self.rope_dim)
        q = torch.cat([q_pe, q_nope], dim=-1)
        # `k_pe` is [num_tokens, 1, rope_dim] (MQA).
-        k = torch.cat([k_pe.squeeze(1), k_nope], dim=-1)
+        k = torch.cat([k_pe.squeeze(-2), k_nope], dim=-1)
        # we only quant q here since k quant is fused with cache insertion
        q = q.view(-1, self.head_dim)
@ -1595,7 +1598,11 @@ class DeepseekV2ForCausalLM(
                    # Determine split axis based on op type
                    # gate/up: ColumnParallel → split along dim 0
                    # down: RowParallel → split along dim 1
-                    split_dim = 1 if "down_proj.weight" in name else 0
+                    split_dim = (
                        1
                        if ("down_proj.weight" in name and loaded_weight.ndim > 1)
                        else 0
                    )
                    total = loaded_weight.shape[split_dim]
                    assert total % num_chunks == 0, (
                        f"Shared expert weight dim {total} "
@ -1608,14 +1615,13 @@ class DeepseekV2ForCausalLM(
                    weight_to_load = loaded_weight
                    if is_fusion_moe_shared_experts_layer:
-                        if split_dim == 0:
+                        chunk_slice = slice(j * chunk_size, (j + 1) * chunk_size)
-                            weight_to_load = loaded_weight[
+                        if loaded_weight.ndim == 1:
-                                j * chunk_size : (j + 1) * chunk_size, :
+                            weight_to_load = loaded_weight[chunk_slice]
-                            ]
+                        elif split_dim == 0:
                            weight_to_load = loaded_weight[chunk_slice, :]
                        else:
-                            weight_to_load = loaded_weight[
+                            weight_to_load = loaded_weight[:, chunk_slice]
                                :, j * chunk_size : (j + 1) * chunk_size
                            ]
                        # Synthesize an expert-style name so expert mapping
                        # can route it
                        chunk_name = name.replace(
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@ -94,6 +94,12 @@ class SupportsMultiModal(Protocol):
    `multimodal_config.mm_encoder_tp_mode="data"`.
    """
    requires_raw_input_tokens: ClassVar[bool] = False
    """
    A flag that indicates this model processes input id tokens
    in their raw form and not input embeddings.
    """
    merge_by_field_config: ClassVar[bool | None] = None
    """
    [DEPRECATED] A flag that indicates which implementation of
@ -306,6 +312,10 @@ def supports_multimodal_raw_input_only(model: type[object] | object) -> bool:
    return getattr(model, "supports_multimodal_raw_input_only", False)
 def requires_raw_input_tokens(model: type[object] | object) -> bool:
    return getattr(model, "requires_raw_input_tokens", False)
 def supports_multimodal_encoder_tp_data(model: type[object] | object) -> bool:
    return getattr(model, "supports_encoder_tp_data", False)
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@ -139,7 +139,7 @@ class MiniCPMVImageEmbeddingInputs(TensorSchema):
    type: Literal["image_embeds"]
    image_embeds: Annotated[
        torch.Tensor | list[torch.Tensor],
-        TensorShape("bn", "ns", "hs"),
+        TensorShape("bn", "ns", "hs", dynamic_dims={"ns"}),
    ]
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@ -101,7 +101,7 @@ class Qwen2AudioEmbeddingInputs(TensorSchema):
    audio_embeds: Annotated[
        list[torch.Tensor],
-        TensorShape("bn", "naf", "hs"),
+        TensorShape("bn", "naf", "hs", dynamic_dims={"naf"}),
    ]
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@ -118,7 +118,7 @@ def _get_feat_extract_output_lengths(input_lengths: torch.Tensor):
    output_lengths = (
        ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13
    )
-    return feat_lengths, output_lengths
+    return output_lengths
 class Qwen3_VisionPatchEmbed(nn.Module):
@ -921,13 +921,11 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
        if audio_feature_lengths is None and feature_attention_mask is None:
            audio_output_lengths = []
        elif audio_feature_lengths is not None:
-            _, audio_output_lens = _get_feat_extract_output_lengths(
+            audio_output_lens = _get_feat_extract_output_lengths(audio_feature_lengths)
                audio_feature_lengths
            )
            audio_output_lengths = audio_output_lens.tolist()
        elif feature_attention_mask is not None:
            assert isinstance(feature_attention_mask, torch.Tensor)
-            _, audio_output_lens = _get_feat_extract_output_lengths(
+            audio_output_lens = _get_feat_extract_output_lengths(
                feature_attention_mask.sum(-1)
            )
            audio_output_lengths = audio_output_lens.tolist()
@ -1111,18 +1109,16 @@ class Qwen3OmniMoeConditionalGenerationMixin(Qwen2_5OmniConditionalGenerationMix
        audio_input: Qwen2_5OmniAudioFeatureInputs,
        audio_hashes: list[str] | None = None,
        cached_audio_features: torch.Tensor | None = None,
-    ) -> torch.Tensor:
+    ) -> tuple[torch.Tensor, ...]:
        input_features = audio_input["input_features"]
        audio_feature_lengths = audio_input["audio_feature_lengths"]
-        audio_feat_lengths, audio_output_lengths = _get_feat_extract_output_lengths(
+        audio_output_lengths = _get_feat_extract_output_lengths(audio_feature_lengths)
            audio_feature_lengths
        )
        audio_outputs = self.audio_tower(
            input_features.to(self.audio_tower.dtype),
            feature_lens=audio_feature_lengths,
-            aftercnn_lens=audio_feat_lengths,
+            aftercnn_lens=audio_output_lengths,
        )
        audio_features = audio_outputs.last_hidden_state
        return audio_features.split(audio_output_lengths.tolist())
@ -1579,7 +1575,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
                    + st_idx
                )
                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-                _, audio_len = _get_feat_extract_output_lengths(
+                audio_len = _get_feat_extract_output_lengths(
                    audio_feature_lengths[audio_idx]
                )
                llm_pos_ids = (
@ -1700,7 +1696,7 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
                llm_pos_ids_list.append(bos_block)
                llm_pos_ids_list.append(bos_block)
                st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0
-                _, audio_len = _get_feat_extract_output_lengths(
+                audio_len = _get_feat_extract_output_lengths(
                    audio_feature_lengths[audio_idx]
                )
                audio_llm_pos_ids = (
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@ -46,6 +46,7 @@ from .interfaces import (
    has_noops,
    is_attention_free,
    is_hybrid,
    requires_raw_input_tokens,
    supports_cross_encoding,
    supports_mamba_prefix_caching,
    supports_multimodal,
@ -422,6 +423,7 @@ _MULTIMODAL_MODELS = {
    ),
    "UltravoxModel": ("ultravox", "UltravoxModel"),
    "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"),  # noqa: E501
    "VoxtralStreamingGeneration": ("voxtral_streaming", "VoxtralStreamingGeneration"),  # noqa: E501
    # [Encoder-decoder]
    "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"),  # noqa: E501
 }
@ -539,6 +541,7 @@ class _ModelInfo:
    supports_cross_encoding: bool
    supports_multimodal: bool
    supports_multimodal_raw_input_only: bool
    requires_raw_input_tokens: bool
    supports_multimodal_encoder_tp_data: bool
    supports_pp: bool
    has_inner_state: bool
@ -562,6 +565,7 @@ class _ModelInfo:
            supports_multimodal_raw_input_only=supports_multimodal_raw_input_only(
                model
            ),
            requires_raw_input_tokens=requires_raw_input_tokens(model),
            supports_multimodal_encoder_tp_data=supports_multimodal_encoder_tp_data(
                model
            ),
--- a/vllm/model_executor/models/siglip2navit.py
+++ b/vllm/model_executor/models/siglip2navit.py
@ -163,8 +163,10 @@ def apply_rotary_pos_emb(
        enable_fp32_compute=True,
    )
-    if is_flash_attn_backend and not current_platform.is_cuda():
+    if is_flash_attn_backend and current_platform.is_cuda():
        apply_rotary_emb_func = apply_rotary_emb.forward_cuda
    elif is_flash_attn_backend and current_platform.is_rocm():
        apply_rotary_emb_func = apply_rotary_emb.forward_hip
    else:
        apply_rotary_emb_func = apply_rotary_emb.forward_native
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import inspect
 import math
 from collections.abc import Iterable, Mapping, Sequence
 from functools import cached_property
@ -116,10 +117,7 @@ class VoxtralProcessorAdapter:
        self,
        audio_length: int,
    ) -> int:
-        pad_audio_length = self._audio_processor.next_multiple_of_chunk_frames(
+        return ceil(audio_length / (self.sampling_rate // self.frame_rate))
            audio_length, self.sampling_rate
        )
        return ceil(pad_audio_length / (self.sampling_rate // self.frame_rate))
    def __call__(
        self,
@ -158,7 +156,14 @@ class VoxtralProcessorAdapter:
            assert audio.ndim == 1
            # pad if necessary
-            audio = self._audio_processor.pad(audio, self.sampling_rate)
+            # TODO(Patrick) - remove once mistral-common is bumped
            sig = inspect.signature(self._audio_processor.pad)
            if "is_online_streaming" in sig.parameters:
                audio = self._audio_processor.pad(
                    audio, self.sampling_rate, is_online_streaming=False
                )
            else:
                audio = self._audio_processor.pad(audio, self.sampling_rate)
            audio_tokens = [self.begin_audio_token_id] + [
                self.audio_token_id
@ -510,6 +515,7 @@ class VoxtralForConditionalGeneration(
    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        remapping_rules = [
            (r"mm_streams_embeddings.embedding_module\.(.*)", r"\1"),
            (r"mm_whisper_embeddings\.(.*)", r"\1"),
            (r"audio_language_projection\.(.*)", r"audio_language_adapter.\1"),
            (
@ -535,13 +541,16 @@ class VoxtralForConditionalGeneration(
        def llm_weights_generator():
            nonlocal loaded_weights
            for name, w in weights:
-                is_encoder = (
+                is_encoder = False
-                    name.startswith("mm_whisper_embeddings")
+                for k in [
-                    and not name.startswith("mm_whisper_embeddings.tok_embeddings")
+                    "mm_whisper_embeddings",
-                    and not name.startswith(
+                    "mm_streams_embeddings.embedding_module",
-                        "mm_whisper_embeddings.audio_language_projection"
+                ]:
                    is_encoder |= (
                        name.startswith(k)
                        and not name.startswith(f"{k}.tok_embeddings")
                        and not name.startswith(f"{k}.audio_language_projection")
                    )
                )
                for pattern, repl in remapping_rules:
                    if re.fullmatch(pattern, name):
@ -676,6 +685,7 @@ class VoxtralEncoderModel(nn.Module):
    packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
    mistral_remapping = [
        (r"mm_streams_embeddings.embedding_module\.(.*)", r"\1"),
        (
            r"whisper_encoder\.conv_layers\.0\.(weight|bias)",
            r"whisper_encoder.conv1.\1",
@ -684,6 +694,14 @@ class VoxtralEncoderModel(nn.Module):
            r"whisper_encoder\.conv_layers\.1\.(weight|bias)",
            r"whisper_encoder.conv2.\1",
        ),
        (
            r"whisper_encoder\.conv_layers\.0\.conv\.(weight|bias)",
            r"whisper_encoder.conv1.\1",
        ),  # noqa: E501
        (
            r"whisper_encoder\.conv_layers\.1\.conv\.(weight|bias)",
            r"whisper_encoder.conv2.\1",
        ),  # noqa: E501
        (
            r"whisper_encoder\.transformer\.layers\.(\d+)\.attention\.w([qkv])\.(weight|bias)",  # noqa: E501
            r"whisper_encoder.layers.\1.self_attn.\2_proj.\3",
--- a/vllm/model_executor/models/voxtral_streaming.py
+++ b/vllm/model_executor/models/voxtral_streaming.py
@ -0,0 +1,243 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import math
 from collections.abc import Mapping
 import torch
 from vllm.config.vllm import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.models.interfaces import MultiModalEmbeddings
 from vllm.model_executor.models.voxtral import (
    VoxtralDummyInputsBuilder,
    VoxtralForConditionalGeneration,
    VoxtralMultiModalProcessor,
    VoxtralProcessingInfo,
 )
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.cache import _I, BaseMultiModalProcessorCache
 from vllm.multimodal.inputs import (
    MultiModalKwargsOptionalItems,
 )
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import (
    MultiModalPromptUpdates,
    PlaceholderFeaturesInfo,
 )
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
 from .utils import (
    _flatten_embeddings,
 )
 logger = init_logger(__name__)
 class VoxtralStreamingMultiModalProcessor(VoxtralMultiModalProcessor):
    """Voxtral multimodal processor specialised for streaming audio.

    Compared with the parent processor it never uses a processor cache and
    it builds the audio placeholder bookkeeping by hand instead of deriving
    it from placeholder tokens in the prompt.
    """
    def __init__(
        self,
        info: _I,
        dummy_inputs: BaseDummyInputsBuilder[_I],
        *,
        cache: BaseMultiModalProcessorCache | None = None,
    ) -> None:
        """Initialise the parent processor; any supplied ``cache`` is ignored."""
        # streaming can't make use of a cache yet
        super().__init__(info, dummy_inputs, cache=None)
    def _maybe_apply_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        prompt_ids: list[int],
        mm_kwargs: MultiModalKwargsOptionalItems,
        mm_prompt_updates: MultiModalPromptUpdates,
        is_update_applied: bool,
    ) -> tuple[list[int], Mapping[str, list[PlaceholderFeaturesInfo]]]:
        """Return ``prompt_ids`` untouched plus one hand-built audio placeholder.

        The streaming prompt carries no audio placeholder tokens, so the
        placeholder entry is constructed manually. Exactly one audio item
        must be present in ``mm_kwargs``; ``mm_items``, ``mm_prompt_updates``
        and ``is_update_applied`` are not used here.
        """
        audio_items = mm_kwargs.get("audio", [])
        assert len(audio_items) == 1, (
            f"Expected only one audio input for streaming, got {mm_kwargs=}"
        )
        audio_cfg = self.info.get_tokenizer().instruct.audio_encoder.audio_config
        sample_count = audio_items[0]["audio_arrays"].data.shape[0]
        token_count = audio_cfg.num_audio_tokens(sample_count)
        # The token values are dummies: only the length of the list is used.
        dummy_tokens = token_count * [0]
        placeholder = PlaceholderFeaturesInfo(
            modality="audio",
            item_idx=0,
            start_idx=0,
            tokens=dummy_tokens,
            is_embed=None,
        )
        return prompt_ids, {"audio": [placeholder]}
 class TimeEmbedding(torch.nn.Module):
    """Sinusoidal embedding of a time value.

    For an input of shape ``(B,)`` or ``(B, T)`` the output has one extra
    trailing axis holding the cosines followed by the sines of the input
    multiplied by ``dim // 2`` fixed inverse frequencies.
    """
    def __init__(self, dim: int, theta: float = 10000.0) -> None:
        super().__init__()
        self.dim = dim
        self.theta = theta
        half_dim = self.dim // 2
        # inv_freq[k] = exp(-log(theta) * k / half_dim) for k in [0, half_dim)
        freq_index = torch.arange(half_dim).float()
        log_inv_freq = -math.log(self.theta) * freq_index / half_dim
        # Non-persistent: the buffer is rebuilt here, never read from a checkpoint.
        self.register_buffer("inv_freq", torch.exp(log_inv_freq), persistent=False)
    def forward(self, t: torch.Tensor) -> torch.Tensor:
        """Embed ``t``; the result follows the device and dtype of ``t``."""
        freqs = self.inv_freq.to(device=t.device, dtype=t.dtype)
        # (B,) -> (B, 1) or (B, T) -> (B, T, 1), then broadcast against (D/2,)
        angles = t.unsqueeze(-1) * freqs
        # (B, D) or (B, T, D): cosine half first, sine half second
        return torch.cat((angles.cos(), angles.sin()), dim=-1)
@MULTIMODAL_REGISTRY.register_processor(
    VoxtralStreamingMultiModalProcessor,
    info=VoxtralProcessingInfo,
    dummy_inputs=VoxtralDummyInputsBuilder,
 )
 class VoxtralStreamingGeneration(VoxtralForConditionalGeneration):
    requires_raw_input_tokens = True
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(vllm_config=vllm_config, prefix=prefix)
        self.time_embedding: TimeEmbedding = TimeEmbedding(
            dim=self.config.text_config.hidden_size
        )
        audio_config = self.tokenizer.instruct.audio_encoder.audio_config
        _n_delay_tokens = (
            audio_config.frame_rate * audio_config.transcription_delay_ms / 1000
        )
        assert _n_delay_tokens.is_integer(), (
            f"n_delay_tokens must be integer, got {_n_delay_tokens}"
        )
        self.n_delay_tokens = int(_n_delay_tokens)
    @property
    def audio_config(self):
        return self.tokenizer.instruct.audio_encoder.audio_config
    def embed_input_ids(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: MultiModalEmbeddings | None = None,
        *,
        is_multimodal: torch.Tensor | None = None,
        # Multi-modal token ID may exceed vocab size
        handle_oov_mm_token: bool = True,
    ) -> torch.Tensor:
        """Pass post-conv embeddings directly as input"""
        # for streaming we simply flatten the multimodal embeddings
        # to be in tensor format, we treat the input ids later
        assert multimodal_embeddings is not None
        assert len(multimodal_embeddings) > 0, (
            "For streaming you must provide a multimodal_embedding at every step."
        )
        mm_embeds_flat = _flatten_embeddings(multimodal_embeddings)
        return mm_embeds_flat
    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
        **kwargs: object,
    ) -> torch.Tensor | IntermediateTensors:
        assert inputs_embeds is not None
        assert input_ids is not None
        pool_size = self.config.audio_config.block_pool_size
        inputs_embeds = inputs_embeds.view(
            inputs_embeds.shape[0] * pool_size, inputs_embeds.shape[1] // pool_size
        )
        audio_hidden_states = self.whisper_encoder.whisper_encoder.forward_layers(
            inputs_embeds
        )
        num_tokens, audio_hidden_size = audio_hidden_states.shape
        assert num_tokens % self.downsample_factor == 0
        audio_hidden_states = audio_hidden_states.reshape(
            num_tokens // self.downsample_factor,
            audio_hidden_size * self.downsample_factor,
        )
        audio_text_embeds = self.audio_language_adapter(audio_hidden_states)
        text_embeds = self.language_model.embed_input_ids(input_ids)
        # sum pool text and audio embeddings
        inputs_embeds = audio_text_embeds + text_embeds
        time_tensor = torch.tensor(
            [self.n_delay_tokens],
            device=inputs_embeds.device,
            dtype=inputs_embeds.dtype,
        )
        inputs_embeds = inputs_embeds + self.time_embedding(time_tensor)
        hidden_states = self.language_model.model(
            input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds
        )
        return hidden_states
    def embed_multimodal(
        self, **kwargs
    ) -> list[torch.Tensor] | torch.Tensor | tuple[torch.Tensor, ...] | None:
        """Transform audio waveforms -> initial whisper post-conv embeddings"""
        audio_inputs = self._parse_and_validate_audio_arrays(**kwargs)
        assert audio_inputs is not None, (
            "For streaming you must provide an audio input at every step."
        )
        multiple_of = self.audio_config.raw_audio_length_per_tok
        assert all(
            (this_audio := audio.shape[0]) % multiple_of == 0 for audio in audio_inputs
        ), (
            f"Every input audio waveform has to be a multiple of {multiple_of}, but"
            f" one is {this_audio} with {(this_audio / multiple_of)=}."
        )
        mel_features = [
            self.whisper_encoder.compute_whisper_melspec(audio).to(
                self.whisper_encoder.dtype
            )
            for audio in audio_inputs
        ]
        seq_lens = [mel.shape[1] for mel in mel_features]
        # [total_num_20ms_frames, hidden_size]
        audio_embeddings = self.whisper_encoder.whisper_encoder.forward_conv(
            mel_features
        )[0]
        conv_stride = self.whisper_encoder.whisper_encoder.total_stride
        audio_embeddings_per_sample = audio_embeddings.split(
            [s // conv_stride for s in seq_lens], dim=0
        )
        # audio_embeddings per sample need to be divisible by 4
        pool_size = self.config.audio_config.block_pool_size
        assert all(
            (this_shape := sample.shape[0]) % pool_size == 0
            for sample in audio_embeddings_per_sample
        ), f"Every audio embedding has to be a multiple of 4, but one is {this_shape}."
        audio_embeddings_per_sample = [
            e.view(e.shape[0] // pool_size, e.shape[1] * pool_size)
            for e in audio_embeddings_per_sample
        ]
        return audio_embeddings_per_sample
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@ -1,9 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import enum
 import math
 from collections.abc import Iterable, Mapping, Sequence
 from contextlib import nullcontext
 from functools import partial
 from typing import Annotated, Literal, cast
 import numpy as np
@ -16,7 +18,10 @@ from transformers import (
 )
 from transformers.models.whisper.modeling_whisper import sinusoids
-from vllm.attention.layer import Attention, AttentionType
+from vllm.attention.backends.abstract import (
    AttentionType,
 )
 from vllm.attention.layer import Attention
 from vllm.attention.layers.cross_attention import CrossAttention
 from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
 from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
@ -34,6 +39,11 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.whisper_utils import (
    ISO639_1_SUPPORTED_LANGS,
    WhisperAttentionWithBlockPooling,
    WhisperCausalConv1d,
 )
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
    MultiModalDataDict,
@ -64,67 +74,11 @@ from .utils import (
 logger = init_logger(__name__)
 # From https://platform.openai.com/docs/guides/speech-to-text/supported-languages
-ISO639_1_SUPPORTED_LANGS = {
+class WhisperPosEmbedType(enum.Enum):
-    "af": "Afrikaans",
+    SINUSOIDAL = "sinusoidal"
-    "ar": "Arabic",
+    NOPE = "nope"
-    "hy": "Armenian",
+    LEARNED = "learned"
    "az": "Azerbaijani",
    "be": "Belarusian",
    "bs": "Bosnian",
    "bg": "Bulgarian",
    "ca": "Catalan",
    "zh": "Chinese",
    "hr": "Croatian",
    "cs": "Czech",
    "da": "Danish",
    "nl": "Dutch",
    "en": "English",
    "et": "Estonian",
    "fi": "Finnish",
    "fr": "French",
    "gl": "Galician",
    "de": "German",
    "el": "Greek",
    "he": "Hebrew",
    "hi": "Hindi",
    "hu": "Hungarian",
    "is": "Icelandic",
    "id": "Indonesian",
    "it": "Italian",
    "ja": "Japanese",
    "kn": "Kannada",
    "kk": "Kazakh",
    "ko": "Korean",
    "lv": "Latvian",
    "lt": "Lithuanian",
    "mk": "Macedonian",
    "ms": "Malay",
    "mr": "Marathi",
    "mi": "Maori",
    "ne": "Nepali",
    "no": "Norwegian",
    "fa": "Persian",
    "pl": "Polish",
    "pt": "Portuguese",
    "ro": "Romanian",
    "ru": "Russian",
    "sr": "Serbian",
    "sk": "Slovak",
    "sl": "Slovenian",
    "es": "Spanish",
    "sw": "Swahili",
    "sv": "Swedish",
    "tl": "Tagalog",
    "ta": "Tamil",
    "th": "Thai",
    "tr": "Turkish",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "vi": "Vietnamese",
    "cy": "Welsh",
 }
 class WhisperAudioInputs(TensorSchema):
@ -184,6 +138,8 @@ class WhisperAttention(nn.Module):
        num_heads: int,
        bias: bool = True,
        attn_type: AttentionType = AttentionType.DECODER,
        per_layer_sliding_window: int | None = None,
        block_pool_size: int = 1,
        cache_config: CacheConfig | None = None,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
@ -242,7 +198,14 @@ class WhisperAttention(nn.Module):
                attn_type=self.attn_type,
            )
        else:  # AttentionType.DECODER (regular decoder self-attention)
-            self.attn = Attention(
+            if block_pool_size > 1:
                attn_cls = partial(
                    WhisperAttentionWithBlockPooling, block_pool_size=block_pool_size
                )
            else:
                attn_cls = Attention
            self.attn = attn_cls(
                self.num_heads,
                self.head_dim,
                self.scaling,
@ -251,6 +214,7 @@ class WhisperAttention(nn.Module):
                quant_config=quant_config,
                prefix=f"{prefix}.attn",
                attn_type=self.attn_type,
                per_layer_sliding_window=per_layer_sliding_window,
            )
    def _init_qkv(
@ -386,6 +350,9 @@ class WhisperEncoderLayer(nn.Module):
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        is_causal = getattr(config, "is_causal", False)
        sliding_window = getattr(config, "sliding_window", None)
        block_pool_size = getattr(config, "block_pool_size", 1)
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config
@ -393,7 +360,9 @@ class WhisperEncoderLayer(nn.Module):
        self.self_attn = WhisperAttention(
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
-            attn_type=AttentionType.ENCODER,
+            attn_type=AttentionType.DECODER if is_causal else AttentionType.ENCODER,
            block_pool_size=block_pool_size,
            per_layer_sliding_window=sliding_window,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.self_attn",
@ -492,12 +461,21 @@ class WhisperEncoder(nn.Module):
        super().__init__()
        config = vllm_config.model_config.hf_config
        embed_dim = config.d_model
        self.pos_embed_type = WhisperPosEmbedType(
            getattr(config, "pos_embed", "sinusoidal")
        )
        self.num_mel_bins = config.num_mel_bins
        self.max_source_positions = config.max_source_positions
        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
-        self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1)
+        is_causal = getattr(config, "is_causal", False)
-        self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)
+        Conv1d = WhisperCausalConv1d if is_causal else partial(nn.Conv1d, padding=1)
        self.conv1 = Conv1d(self.num_mel_bins, embed_dim, kernel_size=3)
        self.conv2 = Conv1d(embed_dim, embed_dim, stride=2, kernel_size=3)
        self.total_stride = self.conv1.stride[0] * self.conv2.stride[0]
        self.start_layer, self.end_layer, self.layers = make_layers(
            config.encoder_layers,
            lambda prefix: WhisperEncoderLayer(
@ -507,29 +485,54 @@ class WhisperEncoder(nn.Module):
        )
        self.layer_norm = nn.LayerNorm(config.d_model)
-        maybe_fp32_init_ctx = (
+        if is_causal and self.pos_embed_type != WhisperPosEmbedType.NOPE:
-            set_default_torch_dtype(torch.float32) if init_in_fp32 else nullcontext()
+            raise ValueError(
-        )
+                "Only NOPE position embeddings are supported "
-
+                f"for causal models, but got {self.pos_embed_type}"
-        with (
+            )
-            torch.no_grad(),
+        elif self.pos_embed_type in (
-            maybe_fp32_init_ctx,
+            WhisperPosEmbedType.SINUSOIDAL,
            WhisperPosEmbedType.LEARNED,
        ):
-            self.embed_positions = nn.Embedding(self.max_source_positions, embed_dim)
+            maybe_fp32_init_ctx = (
-            self.embed_positions.weight.copy_(
+                set_default_torch_dtype(torch.float32)
-                sinusoids(*self.embed_positions.weight.shape)
+                if init_in_fp32
                else nullcontext()
            )
-    def forward(self, input_features: torch.Tensor | list[torch.Tensor]):
+            with (
                torch.no_grad(),
                maybe_fp32_init_ctx,
            ):
                self.embed_positions = nn.Embedding(
                    self.max_source_positions, embed_dim
                )
                self.embed_positions.weight.copy_(
                    sinusoids(*self.embed_positions.weight.shape)
                )
    def forward_conv(
        self, input_features: torch.Tensor | list[torch.Tensor]
    ) -> torch.Tensor:
        hidden_states = []
        input_is_batched = False
        for features in input_features:
            embeds = nn.functional.gelu(self.conv1(features))
            embeds = nn.functional.gelu(self.conv2(embeds))
-            embeds = embeds.transpose(-1, -2)
+
-            embeds = (embeds + self.embed_positions.weight[: embeds.size(-2), :]).to(
+            if self.pos_embed_type in (
-                embeds.dtype
+                WhisperPosEmbedType.SINUSOIDAL,
-            )
+                WhisperPosEmbedType.LEARNED,
            ):
                embeds = embeds.transpose(-1, -2)
                embeds = (
                    embeds + self.embed_positions.weight[: embeds.size(-2), :]
                ).to(embeds.dtype)
            elif self.pos_embed_type == WhisperPosEmbedType.NOPE:
                embeds = embeds.transpose(-1, -2).to(embeds.dtype)
            else:
                raise ValueError(f"Unknown pos_embed_type: {self.pos_embed_type}")
            hidden_states.append(embeds)
            input_is_batched = embeds.ndim > 2
        # Input to MHA must be B x T x D
@ -539,12 +542,19 @@ class WhisperEncoder(nn.Module):
        else:
            hidden_states = torch.stack(hidden_states, dim=0)
        return hidden_states
    def forward_layers(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply every encoder layer in order, then the final layer norm.

        Args:
            hidden_states: Input handed to the first entry of ``self.layers``.

        Returns:
            ``self.layer_norm`` applied to the output of the last layer.
        """
        activations = hidden_states
        for layer in self.layers:
            activations = layer(activations)
        return self.layer_norm(activations)
    def forward(self, input_features: torch.Tensor | list[torch.Tensor]):
        hidden_states = self.forward_conv(input_features)
        return self.forward_layers(hidden_states)
 class WhisperDecoder(nn.Module):
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
--- a/vllm/model_executor/models/whisper_utils.py
+++ b/vllm/model_executor/models/whisper_utils.py
@ -0,0 +1,299 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
 import functools
 import math
 from dataclasses import replace
 import torch
 import torch.nn.functional as F
 from torch import nn
 from vllm.attention.backends.abstract import (
    AttentionBackend,
    AttentionMetadata,
    AttentionType,
 )
 from vllm.attention.layer import Attention
 from vllm.attention.selector import get_attn_backend
 from vllm.config import CacheConfig, VllmConfig
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
 from vllm.v1.attention.backends.utils import (
    CommonAttentionMetadata,
    subclass_attention_backend_with_overrides,
 )
 from vllm.v1.kv_cache_interface import AttentionSpec
 # From https://platform.openai.com/docs/guides/speech-to-text/supported-languages
 # Maps two-letter language codes (ISO 639-1, per the constant's name) to the
 # English name of the language.
 ISO639_1_SUPPORTED_LANGS = {
    "af": "Afrikaans",
    "ar": "Arabic",
    "hy": "Armenian",
    "az": "Azerbaijani",
    "be": "Belarusian",
    "bs": "Bosnian",
    "bg": "Bulgarian",
    "ca": "Catalan",
    "zh": "Chinese",
    "hr": "Croatian",
    "cs": "Czech",
    "da": "Danish",
    "nl": "Dutch",
    "en": "English",
    "et": "Estonian",
    "fi": "Finnish",
    "fr": "French",
    "gl": "Galician",
    "de": "German",
    "el": "Greek",
    "he": "Hebrew",
    "hi": "Hindi",
    "hu": "Hungarian",
    "is": "Icelandic",
    "id": "Indonesian",
    "it": "Italian",
    "ja": "Japanese",
    "kn": "Kannada",
    "kk": "Kazakh",
    "ko": "Korean",
    "lv": "Latvian",
    "lt": "Lithuanian",
    "mk": "Macedonian",
    "ms": "Malay",
    "mr": "Marathi",
    "mi": "Maori",
    "ne": "Nepali",
    "no": "Norwegian",
    "fa": "Persian",
    "pl": "Polish",
    "pt": "Portuguese",
    "ro": "Romanian",
    "ru": "Russian",
    "sr": "Serbian",
    "sk": "Slovak",
    "sl": "Slovenian",
    "es": "Spanish",
    "sw": "Swahili",
    "sv": "Swedish",
    "tl": "Tagalog",
    "ta": "Tamil",
    "th": "Thai",
    "tr": "Turkish",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "vi": "Vietnamese",
    "cy": "Welsh",
 }
 def _pad1d(
    x: torch.Tensor,
    paddings: tuple[int, int],
    mode: str = "constant",
    value: float = 0.0,
 ) -> torch.Tensor:
    """Tiny wrapper around F.pad, just to allow for
    reflect padding on small input.
    If this is the case, we insert extra 0 padding
    to the right before the reflection happen.
    """
    length = x.shape[-1]
    padding_left, padding_right = paddings
    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
    if mode == "reflect":
        max_pad = max(padding_left, padding_right)
        extra_pad = 0
        if length <= max_pad:
            extra_pad = max_pad - length + 1
            x = F.pad(x, (0, extra_pad))
        padded = F.pad(x, paddings, mode, value)
        end = padded.shape[-1] - extra_pad
        return padded[..., :end]
    else:
        return F.pad(x, paddings, mode, value)
 class WhisperCausalConv1d(nn.Conv1d):
    """``nn.Conv1d`` that zero-pads its input before convolving.

    ``kernel_size - stride`` zeros (using the dilated kernel size) are put on
    the left of the last dimension, plus however many zeros are needed on the
    right to complete the final stride window.
    """
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        bias: bool = True,
    ) -> None:
        super().__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            bias=bias,
        )
        conv_stride = self.stride[0]
        # Kernel extent once dilation is taken into account.
        dilated_kernel = (kernel_size - 1) * self.dilation[0] + 1
        self._stride = conv_stride
        self._effective_kernel_size = dilated_kernel
        self._padding_total = dilated_kernel - conv_stride
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pad ``x`` along its last dimension and apply the convolution."""
        kernel = self._effective_kernel_size
        left_pad = self._padding_total
        # Possibly fractional number of output frames for the current length.
        frames = (x.shape[-1] - kernel + left_pad) / self._stride + 1
        # Input length at which the rounded-up frame count fits exactly.
        ideal_length = (math.ceil(frames) - 1) * self._stride + (kernel - left_pad)
        right_pad = ideal_length - x.shape[-1]
        padded = _pad1d(x, (left_pad, right_pad), mode="constant")
        return super().forward(padded)
@functools.lru_cache
 def create_whisper_attention_backend_with_block_pooling(
    underlying_attn_backend: AttentionBackend, block_pool_size: int
 ) -> type[AttentionBackend]:
    prefix = "WhisperAttentionWithBlockPooling_"
    underlying_builder = underlying_attn_backend.get_builder_cls()
    class WhisperAttentionWithBlockPoolingBuilder(underlying_builder):  # type: ignore
        def __init__(
            self,
            kv_cache_spec: AttentionSpec,
            layer_names: list[str],
            vllm_config: VllmConfig,
            device: torch.device,
        ):
            assert kv_cache_spec.num_kv_heads % block_pool_size == 0
            kv_cache_spec = replace(
                kv_cache_spec,
                block_size=kv_cache_spec.block_size * block_pool_size,
                num_kv_heads=kv_cache_spec.num_kv_heads // block_pool_size,
            )
            super().__init__(kv_cache_spec, layer_names, vllm_config, device)
        def build(
            self,
            common_prefix_len: int,
            common_attn_metadata: CommonAttentionMetadata,
            fast_build: bool = False,
        ) -> AttentionMetadata:
            new_common_attn_metadata = copy.deepcopy(common_attn_metadata)
            new_common_attn_metadata.query_start_loc *= block_pool_size
            new_common_attn_metadata.query_start_loc_cpu *= block_pool_size
            new_common_attn_metadata.seq_lens *= block_pool_size
            new_common_attn_metadata._seq_lens_cpu *= block_pool_size
            new_common_attn_metadata._num_computed_tokens_cpu *= block_pool_size
            new_common_attn_metadata.num_actual_tokens *= block_pool_size
            new_common_attn_metadata.max_query_len *= block_pool_size
            new_common_attn_metadata.max_seq_len *= block_pool_size
            original_slot_mapping = common_attn_metadata.slot_mapping
            common_prefix_len *= block_pool_size
            new_common_attn_metadata.slot_mapping = (
                (
                    original_slot_mapping.unsqueeze(1) * block_pool_size
                    + torch.arange(block_pool_size, device=original_slot_mapping.device)
                )
                .flatten()
                .clamp(min=-1)
            )
            return super().build(
                common_prefix_len, new_common_attn_metadata, fast_build
            )
    if not issubclass(underlying_attn_backend, FlashAttentionBackend):
        raise NotImplementedError(
            f"{underlying_attn_backend} is not yet supported."
            "Contributions to support more backends are much "
            "appreciated."
        )
    attn_backend = subclass_attention_backend_with_overrides(
        name_prefix=prefix,
        attention_backend_cls=underlying_attn_backend,
        overrides={
            "get_builder_cls": lambda: WhisperAttentionWithBlockPoolingBuilder,
            "get_kv_cache_shape": lambda num_blocks,
            block_size,
            num_kv_heads,
            head_size,
            cache_dtype_str: (
                2,
                num_blocks,
                # we stretch each block by `block_pool_size`
                block_size * block_pool_size,
                num_kv_heads // block_pool_size,
                head_size,
            ),  # TODO: generalize to other backends
        },
    )
    return attn_backend
 class WhisperAttentionWithBlockPooling(Attention):
    """Attention layer with block pooling.

    The backend returned by ``get_attn_backend`` is wrapped with
    ``create_whisper_attention_backend_with_block_pooling``, and the reported
    KV cache spec has its ``num_kv_heads`` multiplied by ``block_pool_size``.
    """
    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: int | None = None,
        alibi_slopes: list[float] | None = None,
        cache_config: CacheConfig | None = None,
        quant_config: QuantizationConfig | None = None,
        logits_soft_cap: float | None = None,
        per_layer_sliding_window: int | None = None,
        prefix: str = "",
        attn_type: str = AttentionType.DECODER,
        kv_sharing_target_layer_name: str | None = None,
        block_pool_size: int = 1,
        attn_backend: type[AttentionBackend] | None = None,
        **extra_impl_args,
    ) -> None:
        # Recorded before the parent constructor runs; read by
        # ``get_kv_cache_spec``.
        self.block_pool_size = block_pool_size
        default_dtype = torch.get_default_dtype()
        if cache_config is None:
            cache_dtype, cache_block_size = "auto", 16
        else:
            cache_dtype = cache_config.cache_dtype
            cache_block_size = cache_config.block_size
        # The ``attn_backend`` argument is not consulted: the backend handed to
        # the parent is always the pooled one derived here.
        pooled_backend = create_whisper_attention_backend_with_block_pooling(
            get_attn_backend(
                head_size,
                default_dtype,
                cache_dtype,
                cache_block_size,
                attn_type=attn_type,
            ),
            block_pool_size,
        )
        super().__init__(
            num_heads=num_heads,
            head_size=head_size,
            scale=scale,
            num_kv_heads=num_kv_heads,
            alibi_slopes=alibi_slopes,
            cache_config=cache_config,
            quant_config=quant_config,
            logits_soft_cap=logits_soft_cap,
            per_layer_sliding_window=per_layer_sliding_window,
            prefix=prefix,
            attn_type=attn_type,
            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
            attn_backend=pooled_backend,
            **extra_impl_args,
        )
    def get_kv_cache_spec(self, vllm_config: VllmConfig):
        """Return the parent's spec with ``num_kv_heads`` scaled by the pool size."""
        base_spec = super().get_kv_cache_spec(vllm_config)
        assert isinstance(base_spec, AttentionSpec)
        pooled_heads = self.block_pool_size * base_spec.num_kv_heads
        return replace(base_spec, num_kv_heads=pooled_heads)
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@ -111,11 +111,16 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
    def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
        """Load the audio file at ``filepath`` through ``librosa.load``.

        ``sr=None`` is passed to librosa and the call's result is returned
        unchanged.
        """
        loaded = librosa.load(filepath, sr=None)
        return loaded
-    def encode_base64(self, media: tuple[npt.NDArray, int]) -> str:
+    def encode_base64(
        self,
        media: tuple[npt.NDArray, int],
        *,
        audio_format: str = "WAV",
    ) -> str:
        audio, sr = media
        with BytesIO() as buffer:
-            soundfile.write(buffer, audio, sr, format="WAV")
+            soundfile.write(buffer, audio, sr, format=audio_format)
            data = buffer.getvalue()
        return base64.b64encode(data).decode("utf-8")
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@ -8,8 +8,12 @@ import pybase64
 import torch
 from PIL import Image
 from vllm.logger import init_logger
 from .base import MediaIO, MediaWithBytes
 logger = init_logger(__file__)
 def rescale_image_size(
    image: Image.Image, size_factor: float, transpose: int = -1
@ -104,8 +108,17 @@ class ImageMediaIO(MediaIO[Image.Image]):
        self,
        media: Image.Image,
        *,
-        image_format: str = "JPEG",
+        image_format: str | None = None,
    ) -> str:
        if image_format is None:
            logger.warning_once(
                "The default format of `ImageMediaIO.encode_base64` will be changed "
                'from "JPEG" to "PNG" in v0.15 to avoid lossy compression. '
                "To continue using the old default, "
                'pass `format="JPEG"` explicitly to silence this warning.'
            )
            image_format = "JPEG"
        image = media
        with BytesIO() as buffer:
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@ -3,6 +3,7 @@
 import asyncio
 import atexit
 import mimetypes
 from collections.abc import Generator, Set
 from concurrent.futures import ThreadPoolExecutor
 from itertools import groupby
@ -357,17 +358,31 @@ class MediaConnector:
 def encode_audio_base64(
    audio: np.ndarray,
    sampling_rate: int,
    *,
    format: str = "WAV",
 ) -> str:
    """Encode audio as base64."""
    audio_io = AudioMediaIO()
-    return audio_io.encode_base64((audio, sampling_rate))
+    return audio_io.encode_base64((audio, sampling_rate), audio_format=format)
 def encode_audio_url(
    audio: np.ndarray,
    sampling_rate: int,
    *,
    format: str = "WAV",
 ) -> str:
    """Encode audio as a data URL.

    The MIME type is looked up in ``mimetypes.types_map`` using the
    lower-cased ``format`` as a file extension, falling back to ``"audio"``
    when the extension is unknown.
    """
    payload = encode_audio_base64(audio, sampling_rate, format=format)
    extension = "." + format.lower()
    mime = mimetypes.types_map.get(extension, "audio")
    return f"data:{mime};base64,{payload}"
 def encode_image_base64(
    image: Image.Image,
    *,
    image_mode: str = "RGB",
-    format: str = "JPEG",
+    format: str | None = None,
 ) -> str:
    """
    Encode a pillow image to base64 format.
@ -378,10 +393,45 @@ def encode_image_base64(
    return image_io.encode_base64(image, image_format=format)
-def encode_video_base64(frames: npt.NDArray) -> str:
+def encode_image_url(
    image: Image.Image,
    *,
    image_mode: str = "RGB",
    format: str = "PNG",
 ) -> str:
    """
    Encode a pillow image as a data URL.
    By default, the image is converted into RGB format before being encoded.
    """
    image_b64 = encode_image_base64(image, image_mode=image_mode, format=format)
    mimetype = mimetypes.types_map.get("." + format.lower(), "image")
    return f"data:{mimetype};base64,{image_b64}"
 def encode_video_base64(
    frames: npt.NDArray,
    *,
    format: str = "JPEG",
 ) -> str:
    image_io = ImageMediaIO()
    video_io = VideoMediaIO(image_io)
-    return video_io.encode_base64(frames)
+    return video_io.encode_base64(frames, video_format=format)
 def encode_video_url(
    frames: npt.NDArray,
    *,
    format: str = "JPEG",
 ) -> str:
    """Encode video frames as a data URL.

    ``format`` is compared case-insensitively: ``"jpeg"`` maps to
    ``video/jpeg``; anything else is looked up in ``mimetypes.types_map`` by
    extension, with ``"video"`` as the fallback.
    """
    payload = encode_video_base64(frames, format=format)
    lowered = format.lower()
    mime = (
        "video/jpeg"
        if lowered == "jpeg"
        else mimetypes.types_map.get("." + lowered, "video")
    )
    return f"data:{mime};base64,{payload}"
 def argsort_mm_positions(
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@ -7,7 +7,6 @@ from typing import TYPE_CHECKING, Optional
 import torch
 import vllm.envs as envs
 from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
@ -168,32 +167,6 @@ class XPUPlatform(Platform):
        if vllm_config.kv_transfer_config is not None:
            vllm_config.kv_transfer_config.enable_permute_local_kv = True
        if parallel_config.distributed_executor_backend is None:
            if parallel_config.world_size > 1:
                parallel_config.distributed_executor_backend = "ray"
            else:
                parallel_config.distributed_executor_backend = "uni"
        elif parallel_config.distributed_executor_backend == "mp":
            # FIXME(kunshang):
            # spawn needs calling `if __name__ == '__main__':`
            # fork is not supported for xpu start new process.
            if envs.VLLM_WORKER_MULTIPROC_METHOD != "spawn":
                os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
                logger.warning(
                    "Please use spawn as start method if you want to use mp."
                )
        elif (
            parallel_config.distributed_executor_backend != "ray"
            and parallel_config.distributed_executor_backend != "uni"
            and parallel_config.distributed_executor_backend != "external_launcher"
        ):
            logger.warning(
                "%s is not supported on XPU, fallback to ray distributed"
                " executor backend.",
                parallel_config.distributed_executor_backend,
            )
            parallel_config.distributed_executor_backend = "ray"
        if model_config and model_config.use_mla:
            logger.info(
                "MLA is enabled on a non-GPU platform; forcing chunked "
--- a/vllm/tool_parsers/mistral_tool_parser.py
+++ b/vllm/tool_parsers/mistral_tool_parser.py
@ -131,78 +131,105 @@ class MistralToolParser(ToolParser):
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """
-        Extract the tool calls from a complete model response. Requires
+        Extract the tool calls from a complete model response.
-        find-and-replacing single quotes with double quotes for JSON parsing,
+
-        make sure your tool call arguments don't ever include quotes!
+        Content and tool calls formatting depends on the Mistral's tokenizer version
        used to train the model:
        - < v11: `content[BOT] [{tool_call1},{tool_call2}]`
        - >= v11: `content[BOT]tool_name1{args_call1}[BOT]tool_name2{args_call2}`
        with [BOT] the tool call token.
        Note:
            For tokenizer versions >= v11, tool calls with arguments wrongly formatted
            are still returned as tool calls. This is to allow the model to know it
            tried to make a tool call. It reduces chance of another failure and
            prevents that the context is filled with tool calls wrongly placed in
            assistant message contents.
        """
-        # case -- if a tool call token is not present, return a text response
+        # If the tool call token is not present, return a text response
        if self.bot_token not in model_output:
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )
-        # first remove the BOT token
+        content_and_raw_tool_calls = model_output.split(self.bot_token)
-        tool_content = model_output.replace(self.bot_token, "").strip()
+        content = content_and_raw_tool_calls[0]
        raw_tool_calls = content_and_raw_tool_calls[1:]
-        try:
+        # >= v11: content[BOT]tool_name1{args_call1}[BOT]tool_name2{args_call2}
        if not self._is_pre_v11:
            tool_calls = []
            for raw_tool_call in raw_tool_calls:
                if "{" not in raw_tool_call:
                    continue
                end_name = raw_tool_call.find("{")
                tool_name, args = (
                    raw_tool_call[:end_name],
                    raw_tool_call[end_name:],
                )
                tool_calls.append({"name": tool_name, "arguments": args})
        # < v11: content[BOT] [{tool_call1},{tool_call2}]
        else:
            if len(raw_tool_calls) != 1:
                raise ValueError(
                    "Only one BOT token should have been outputted, "
                    f"but got {model_output}."
                )
            stringified_tool_calls = raw_tool_calls[0].strip()
            try:
-                if not self._is_pre_v11:
+                tool_calls = json.loads(stringified_tool_calls)
                    function_call_arr = []
                    for single_tool_content in model_output.split(self.bot_token):
                        if "{" not in single_tool_content:
                            continue
                        end_name = single_tool_content.find("{")
                        fn_name, args = (
                            single_tool_content[:end_name],
                            single_tool_content[end_name:],
                        )
                        # fn_name is encoded outside serialized json dump
                        # only arguments are serialized
                        function_call_arr.append(
                            {"name": fn_name, "arguments": json.loads(args)}
                        )
                else:
                    function_call_arr = json.loads(tool_content)
            except json.JSONDecodeError:
                # use a regex to find the part corresponding to the tool call.
                # NOTE: This use case should not happen if the model is trained
                # correctly. It's an easy possible fix so it's included, but
                # can be brittle for very complex / highly nested tool calls
-                raw_tool_call = self.tool_call_regex.findall(tool_content)[0]
+                try:
-                function_call_arr = json.loads(raw_tool_call)
+                    raw_tool_call = self.tool_call_regex.findall(
-
+                        stringified_tool_calls
-            # Tool Call
+                    )[0]
-            tool_calls: list[MistralToolCall] = [
+                    tool_calls = json.loads(raw_tool_call)
-                MistralToolCall(
+                except (IndexError, json.JSONDecodeError):
-                    type="function",
+                    logger.exception("Error in extracting tool call from response: {e}")
-                    function=FunctionCall(
+                    # If raw decoding and decoding post regex rule fails, then just
-                        name=raw_function_call["name"],
+                    # return content.
-                        # function call args are JSON but as a string
+                    return ExtractedToolCallInformation(
-                        arguments=json.dumps(
+                        tools_called=False,
-                            raw_function_call["arguments"], ensure_ascii=False
+                        tool_calls=[],
                        content=stringified_tool_calls,
                    )
            else:
                tool_calls = [
                    {
                        "name": tool_call["name"],
                        "arguments": json.dumps(
                            tool_call["arguments"], ensure_ascii=False
                        ),
-                    ),
+                    }
-                )
+                    for tool_call in tool_calls
-                for raw_function_call in function_call_arr
+                ]
            ]
-            # get any content before  the tool call
+        mistral_tool_calls: list[MistralToolCall] = [
-            content = model_output.split(self.bot_token)[0]
+            MistralToolCall(
-            return ExtractedToolCallInformation(
+                type="function",
-                tools_called=True,
+                function=FunctionCall(
-                tool_calls=tool_calls,
+                    name=tool_call["name"],
-                content=content if len(content) > 0 else None,
+                    arguments=tool_call["arguments"],
                ),
            )
            for tool_call in tool_calls
        ]
-        except Exception:
+        return ExtractedToolCallInformation(
-            logger.exception("Error in extracting tool call from response.")
+            tools_called=True,
-            # return information to just treat the tool call as regular JSON
+            tool_calls=mistral_tool_calls,
-            return ExtractedToolCallInformation(
+            content=content if len(content) > 0 else None,
-                tools_called=False, tool_calls=[], content=tool_content
+        )
            )
    def extract_tool_calls_streaming(
        self,
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@ -330,19 +330,25 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
    rope_theta = getattr_iter(config, names, None, warn=True)
    names = ["partial_rotary_factor", "rotary_pct", "rotary_emb_fraction"]
    partial_rotary_factor = getattr_iter(config, names, None, warn=True)
    ompe = getattr(config, "original_max_position_embeddings", None)
    if Version(version("transformers")) < Version("5.0.0.dev0"):
        # Transformers v4 installed, legacy config fields may be present
        if (rope_scaling := getattr(config, "rope_scaling", None)) is not None:
            config.rope_parameters = rope_scaling
        if (
-            rope_theta is not None or partial_rotary_factor is not None
+            rope_theta is not None
            or partial_rotary_factor is not None
            or ompe is not None
        ) and not getattr(config, "rope_parameters", None):
            config.rope_parameters = {"rope_type": "default"}
        # Patch legacy fields into rope_parameters
        if rope_theta is not None:
            config.rope_parameters["rope_theta"] = rope_theta
        if partial_rotary_factor is not None:
            config.rope_parameters["partial_rotary_factor"] = partial_rotary_factor
        if ompe is not None:
            config.rope_parameters["original_max_position_embeddings"] = ompe
    elif rope_theta is not None or getattr(config, "rope_parameters", None):
        # Transformers v5 installed
        # Patch these fields in case they used non-standard names
@ -358,10 +364,6 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
    if getattr(config, "rope_parameters", None) is None:
        return
    # Add original_max_position_embeddings if present
    if ompe := getattr(config, "original_max_position_embeddings", None):
        config.rope_parameters["original_max_position_embeddings"] = ompe
    # Handle nested rope_parameters in interleaved sliding attention models
    if is_rope_parameters_nested(config.rope_parameters):
        for rope_parameters_layer_type in config.rope_parameters.values():
--- a/vllm/transformers_utils/configs/mistral.py
+++ b/vllm/transformers_utils/configs/mistral.py
@ -184,18 +184,42 @@ def _remap_mistral_audio_args(config: dict) -> dict:
    whisper_args = config["multimodal"].pop("whisper_model_args")
    encoder_args = whisper_args["encoder_args"]
    downsample_args = whisper_args["downsample_args"]
    downsample_factor = downsample_args["downsample_factor"]
    # make sure that k/v blocks can be allocated with
    # unified k/v cache class and pool whisper k/v cache blocks
    # with downsample_factor:1 ratio
    if encoder_args.get("causal"):
        block_pool_size = downsample_factor
        config["projection_size"] = downsample_factor * encoder_args["dim"]
    else:
        block_pool_size = 1
    _maybe_sliding_window = encoder_args.get("ragged_attention", None)
    if _maybe_sliding_window is None:
        sliding_window = None
    elif _maybe_sliding_window.isdigit():
        sliding_window = int(_maybe_sliding_window)
    else:
        raise NotImplementedError(f"Unsupported: {_maybe_sliding_window=}")
    architecture = (
        "VoxtralStreamingGeneration"
        if encoder_args.get("causal")
        else "VoxtralForConditionalGeneration"
    )
    quant_config = config.get("quantization_config")
    config = {
-        "model_type": "whixtral",
+        "model_type": "voxtral",
-        "architectures": ["VoxtralForConditionalGeneration"],
+        "architectures": [architecture],
        "text_config": PretrainedConfig.from_dict(config),
        "audio_config": WhisperConfig(
            num_mel_bins=encoder_args["audio_encoding_args"]["num_mel_bins"],
            window_size=encoder_args["audio_encoding_args"]["window_size"],
            sampling_rate=encoder_args["audio_encoding_args"]["sampling_rate"],
            hop_length=encoder_args["audio_encoding_args"]["hop_length"],
-            downsample_factor=downsample_args["downsample_factor"],
+            downsample_factor=downsample_factor,
            d_model=encoder_args["dim"],
            encoder_layers=encoder_args["n_layers"],
            encoder_ffn_dim=encoder_args["hidden_dim"],
@ -203,6 +227,10 @@ def _remap_mistral_audio_args(config: dict) -> dict:
            vocab_size=encoder_args["vocab_size"],
            max_source_positions=encoder_args["max_source_positions"],
            is_encoder_decoder=False,  # Override WhisperConfig default
            is_causal=encoder_args.get("causal", False),
            sliding_window=sliding_window,
            block_pool_size=block_pool_size,
            pos_embed=encoder_args.get("pos_embed", "sinusoidal"),
        ),
    }
    if quant_config:
--- a/vllm/v1/attention/backends/mamba1_attn.py
+++ b/vllm/v1/attention/backends/mamba1_attn.py
@ -3,17 +3,11 @@
 from dataclasses import dataclass
 import torch
 from vllm.attention.backends.abstract import AttentionBackend
-from vllm.attention.backends.utils import PAD_SLOT_ID
+from vllm.v1.attention.backends.mamba_attn import (
-from vllm.config import VllmConfig
+    BaseMambaAttentionMetadata,
-from vllm.v1.attention.backends.mamba_attn import BaseMambaAttentionMetadataBuilder
+    BaseMambaAttentionMetadataBuilder,
 from vllm.v1.attention.backends.utils import (
    CommonAttentionMetadata,
    split_decodes_and_prefills,
 )
 from vllm.v1.kv_cache_interface import AttentionSpec, MambaSpec
 class Mamba1AttentionBackend(AttentionBackend):
@ -23,137 +17,12 @@ class Mamba1AttentionBackend(AttentionBackend):
@dataclass
-class Mamba1AttentionMetadata:
+class Mamba1AttentionMetadata(BaseMambaAttentionMetadata):
-    query_start_loc_p: torch.Tensor
+    pass
    state_indices_tensor: torch.Tensor
    has_initial_states_p: torch.Tensor | None
    num_prefills: int
    num_prefill_tokens: int
    num_decodes: int
    num_decode_tokens: int
    block_idx_last_scheduled_token: torch.Tensor  # shape: [batch,]
    block_idx_first_scheduled_token_p: torch.Tensor  # shape: [batch,]
    block_idx_last_computed_token: torch.Tensor  # shape: [batch,]
    num_computed_tokens_p: torch.Tensor  # shape: [batch,]
 class Mamba1AttentionMetadataBuilder(
    BaseMambaAttentionMetadataBuilder[Mamba1AttentionMetadata]
 ):
-    def __init__(
+    metadata_cls = Mamba1AttentionMetadata
-        self,
+    supports_update_block_table: bool = False
        kv_cache_spec: AttentionSpec,
        layer_names: list[str],
        vllm_config: VllmConfig,
        device: torch.device,
    ):
        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
        assert isinstance(kv_cache_spec, MambaSpec)
    def build(
        self,
        common_prefix_len: int,
        common_attn_metadata: CommonAttentionMetadata,
        fast_build: bool = False,
    ) -> Mamba1AttentionMetadata:
        num_reqs = common_attn_metadata.num_reqs
        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
            split_decodes_and_prefills(
                common_attn_metadata, decode_threshold=self.reorder_batch_threshold
            )
        )
        has_initial_states_p = None
        query_start_loc_p = None
        num_computed_tokens, num_computed_tokens_p = None, None
        block_idx_first_scheduled_token = None
        block_idx_first_scheduled_token_p = None
        # TODO(@Josephasafg) Mamba1 and Mamba2 have a lot of code in common here.
        # We should consolidate this code
        if self.vllm_config.cache_config.enable_prefix_caching:
            # Return a tensor of shape (#requests, #max blocks)
            state_indices_tensor = common_attn_metadata.block_table_tensor
            mamba_block_size = self.kv_cache_spec.block_size
            num_computed_tokens = common_attn_metadata.num_computed_tokens_cpu.to(
                self.device
            )
            (
                block_idx_last_computed_token,
                block_idx_first_scheduled_token,
                block_idx_last_scheduled_token,
            ) = self._compute_prefix_caching_block_indices(
                common_attn_metadata, mamba_block_size
            )
        else:
            # Always return just a single block per each request:
            state_indices_tensor = common_attn_metadata.block_table_tensor[:, 0]
            block_idx_last_scheduled_token = None
            block_idx_last_computed_token = None
        if num_prefills > 0:
            query_start_loc_p = (
                common_attn_metadata.query_start_loc[-num_prefills - 1 :]
                - num_decode_tokens
            )
            has_initial_states_cpu = (
                common_attn_metadata.num_computed_tokens_cpu[
                    num_reqs - num_prefills : num_reqs
                ]
                > 0
            )
            has_initial_states_p = has_initial_states_cpu.to(
                common_attn_metadata.query_start_loc.device
            )
            if self.vllm_config.cache_config.enable_prefix_caching:
                assert num_computed_tokens is not None
                num_computed_tokens_p = num_computed_tokens[
                    num_reqs - num_prefills : num_reqs
                ]
                assert block_idx_first_scheduled_token is not None
                block_idx_first_scheduled_token_p = block_idx_first_scheduled_token[
                    num_reqs - num_prefills : num_reqs
                ]
        elif (
            num_decodes > 0
            and num_decodes <= self.decode_cudagraph_max_bs
            and self.compilation_config.cudagraph_mode.has_full_cudagraphs()
        ):
            self.state_indices_tensor[:num_decodes].copy_(
                state_indices_tensor, non_blocking=True
            )
            state_indices_tensor = self.state_indices_tensor[:num_decode_tokens]
            state_indices_tensor[num_decodes:] = PAD_SLOT_ID
            if self.vllm_config.cache_config.enable_prefix_caching:
                self.block_idx_last_scheduled_token[:num_decodes].copy_(
                    block_idx_last_scheduled_token, non_blocking=True
                )
                block_idx_last_scheduled_token = self.block_idx_last_scheduled_token[
                    :num_decode_tokens
                ]
                self.block_idx_last_computed_token[:num_decodes].copy_(
                    block_idx_last_computed_token, non_blocking=True
                )
                block_idx_last_computed_token = self.block_idx_last_computed_token[
                    :num_decode_tokens
                ]
        return Mamba1AttentionMetadata(
            query_start_loc_p=query_start_loc_p,
            has_initial_states_p=has_initial_states_p,
            state_indices_tensor=state_indices_tensor,
            num_prefills=num_prefills,
            num_prefill_tokens=num_prefill_tokens,
            num_decodes=num_decodes,
            num_decode_tokens=num_decode_tokens,
            block_idx_last_scheduled_token=block_idx_last_scheduled_token,
            block_idx_first_scheduled_token_p=block_idx_first_scheduled_token_p,
            block_idx_last_computed_token=block_idx_last_computed_token,
            num_computed_tokens_p=num_computed_tokens_p,
        )
--- a/Show More
+++ b/Show More
`@ -2,4 +2,4 @@`

	`vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.`	`vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.`

	`Please see [this guide](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) for more details on using vLLM with KServe.`	You can use vLLM with KServe's [Hugging Face serving runtime](https://kserve.github.io/website/docs/model-serving/generative-inference/overview) or via [`LLMInferenceService` that uses llm-d](https://kserve.github.io/website/docs/model-serving/generative-inference/llmisvc/llmisvc-overview).