diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f1cd39ef4f948..e444becd9867b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -192,6 +192,7 @@ steps: # test with internal dp - python3 ../examples/offline_inference/data_parallel.py --enforce-eager - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py @@ -970,6 +971,7 @@ steps: - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py + - vllm/model_executor/layers/fused_moe/layer.py - tests/compile/test_fusion_attn.py - tests/compile/test_silu_mul_quant_fusion.py - tests/compile/distributed/test_fusion_all_reduce.py @@ -1116,6 +1118,7 @@ steps: # https://github.com/NVIDIA/nccl/issues/1838 - export NCCL_CUMEM_HOST_ENABLE=0 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py diff --git a/CMakeLists.txt b/CMakeLists.txt index a4cf51d17e982..86746a0db4c0e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -136,7 +136,7 @@ elseif(HIP_FOUND) # ROCm 5.X and 6.X if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND - NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM}) + Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM}) message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} " "expected for ROCm build, saw ${Torch_VERSION} instead.") endif() diff --git 
a/csrc/cache.h b/csrc/cache.h index b162a4a2bc31f..f2a5ec0acf5cd 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -41,11 +41,12 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, const double scale, const std::string& kv_cache_dtype); void gather_and_maybe_dequant_cache( - torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] - torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] - torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] - torch::Tensor const& cu_seq_lens, // [BATCH+1] - int64_t batch_size, const std::string& kv_cache_dtype, + torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] + torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] + torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] + torch::Tensor const& cu_seq_lens, // [BATCH+1] + torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS] + int64_t num_tokens, const std::string& kv_cache_dtype, torch::Tensor const& scale, std::optional seq_starts = std::nullopt); diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 32960cc8073bb..8a5457206c706 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -905,91 +905,79 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, namespace vllm { // grid is launched with dimensions (batch, num_splits) -template +template __global__ void gather_and_maybe_dequant_cache( - const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, - // ENTRIES...] - scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...] - const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES] - const int32_t* __restrict__ cu_seq_lens, // [BATCH+1] - const int32_t block_size, const int32_t entry_size, + const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE, + // ENTRIES...] + scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...] 
+ const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES] + const int32_t* __restrict__ cu_seq_lens, // [BATCH+1] + const int32_t* __restrict__ token_to_seq, // [MAX_TOKEN_ACROSS_CHUNK] + const int32_t num_tokens, const int32_t block_size, const int64_t block_table_stride, const int64_t cache_block_stride, const int64_t cache_entry_stride, const int64_t dst_entry_stride, const float* __restrict__ scale, const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per // batch + constexpr int vec_size = sizeof(float4) / sizeof(scalar_t); + using ltype = vllm::vec_n_t; + using stype = vllm::vec_n_t; + // We are adding this for code readability which will be optimized out when + // build in release. + assert(CTA_SIZE == blockDim.x); - const int64_t bid = blockIdx.x; // Batch ID - const int32_t num_splits = gridDim.y; - const int32_t split = blockIdx.y; - const int32_t seq_start = cu_seq_lens[bid]; - const int32_t seq_end = cu_seq_lens[bid + 1]; - const int32_t seq_len = seq_end - seq_start; - const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size); - const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits); +#pragma unroll + for (int token_id = blockIdx.x; token_id < num_tokens; + token_id += gridDim.x) { + int64_t batch_id = token_to_seq[token_id]; + int64_t batch_start = cu_seq_lens[batch_id]; + int64_t batch_end = cu_seq_lens[batch_id + 1]; + int32_t batch_offset = token_id - batch_start; - const int32_t split_start = split * split_blocks; - const int32_t split_end = min((split + 1) * split_blocks, tot_blocks); + if (token_id >= batch_end) return; + int32_t offset = 0; + if (seq_starts != nullptr) { + offset = seq_starts[batch_id]; + } + batch_offset += offset; + int32_t block_table_id = batch_offset / block_size; + int32_t slot_id = batch_offset % block_size; + int32_t block_table_offset = batch_id * block_table_stride + block_table_id; + int32_t block_id = block_table[block_table_offset]; + int64_t cache_offset 
= + block_id * cache_block_stride + slot_id * cache_entry_stride; + constexpr int32_t vec_iter_cnt = ENTRY_SIZE / vec_size; + scalar_t* dst_ = dst + token_id * dst_entry_stride; + cache_t* src_ = const_cast(src_cache) + cache_offset; - const bool is_active_split = (split_start < tot_blocks); - const bool is_last_split = (split_end == tot_blocks); - - if (!is_active_split) return; - - int32_t full_blocks_end = split_end; - int32_t partial_block_size = 0; - - // Adjust the pointer for the block_table for this batch. - // If seq_starts is provided, compute an offset based on (seq_starts[bid] / - // page_size) - const int32_t batch_offset = bid * block_table_stride; - int32_t offset = 0; - if (seq_starts != nullptr) { - offset = seq_starts[bid] / block_size; - } - const int32_t* batch_block_table = block_table + batch_offset + offset; - - // Adjust dst pointer based on the cumulative sequence lengths. - dst += seq_start * dst_entry_stride; - - if (is_last_split) { - partial_block_size = seq_len % block_size; - if (partial_block_size) full_blocks_end -= 1; - } - - auto copy_entry = [&](const cache_t* __restrict__ _src, - scalar_t* __restrict__ _dst) { - for (int i = threadIdx.x; i < entry_size; i += blockDim.x) { +#pragma unroll + for (int idx = threadIdx.x; idx < vec_iter_cnt; idx += CTA_SIZE) { if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { - _dst[i] = static_cast(_src[i]); + reinterpret_cast(dst_)[idx] = + static_cast(reinterpret_cast(src_)[idx]); } else { - _dst[i] = - fp8::scaled_convert(_src[i], *scale); + ltype loaded_val = reinterpret_cast(src_)[idx]; + stype store_val; +#pragma unroll + for (int j = 0; j < vec_size; ++j) { + store_val.val[j] = fp8::scaled_convert( + loaded_val.val[j], *scale); + } + reinterpret_cast(dst_)[idx] = store_val; } } - }; - - const auto loop_end = - std::min((int64_t)full_blocks_end, block_table_stride - offset); - for (int pid = split_start; pid < loop_end; ++pid) { - auto block_id = batch_block_table[pid]; - auto 
block_start_ptr = src_cache + block_id * cache_block_stride; - auto block_dst_ptr = dst + pid * block_size * dst_entry_stride; - for (int eid = 0; eid < block_size; ++eid) { - copy_entry(block_start_ptr + eid * cache_entry_stride, - block_dst_ptr + eid * dst_entry_stride); - } - } - - if (partial_block_size) { - if (offset + full_blocks_end < block_table_stride) { - auto block_id = batch_block_table[full_blocks_end]; - auto block_start_ptr = src_cache + block_id * cache_block_stride; - auto block_dst_ptr = - dst + full_blocks_end * block_size * dst_entry_stride; - for (int eid = 0; eid < partial_block_size; ++eid) { - copy_entry(block_start_ptr + eid * cache_entry_stride, - block_dst_ptr + eid * dst_entry_stride); + // process tail + constexpr int32_t tail_cnt = ENTRY_SIZE % vec_size; + dst_ = dst_ + ENTRY_SIZE - tail_cnt; + src_ = src_ + ENTRY_SIZE - tail_cnt; +#pragma unroll + for (int idx = threadIdx.x; idx < tail_cnt; idx += CTA_SIZE) { + if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { + dst_[idx] = static_cast(src_[idx]); + } else { + dst_[idx] = + fp8::scaled_convert(src_[idx], *scale); } } } @@ -1001,34 +989,38 @@ __global__ void gather_and_maybe_dequant_cache( // SCALAR_T is the data type of the destination tensor. // CACHE_T is the stored data type of kv-cache. // KV_DTYPE is the real data type of kv-cache. 
-#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \ - vllm::gather_and_maybe_dequant_cache \ - <<>>( \ - reinterpret_cast(src_cache.data_ptr()), \ - reinterpret_cast(dst.data_ptr()), \ - block_table.data_ptr(), cu_seq_lens.data_ptr(), \ - block_size, entry_size, block_table_stride, cache_block_stride, \ - cache_entry_stride, dst_entry_stride, \ - reinterpret_cast(scale.data_ptr()), seq_starts_ptr); +#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \ + vllm::gather_and_maybe_dequant_cache \ + <<>>( \ + reinterpret_cast(src_cache.data_ptr()), \ + reinterpret_cast(dst.data_ptr()), \ + block_table.data_ptr(), cu_seq_lens.data_ptr(), \ + token_to_seq.data_ptr(), num_tokens, block_size, \ + block_table_stride, cache_block_stride, cache_entry_stride, \ + dst_entry_stride, reinterpret_cast(scale.data_ptr()), \ + seq_starts_ptr); // Gather sequences from the cache into the destination tensor. // - cu_seq_lens contains the cumulative sequence lengths for each batch // - block_table contains the cache block indices for each sequence +// - token_to_seq contains the back mapping from token_id to batch_id // - Optionally, seq_starts (if provided) offsets the starting block index by // (seq_starts[bid] / page_size) void gather_and_maybe_dequant_cache( - torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] - torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] - torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] - torch::Tensor const& cu_seq_lens, // [BATCH+1] - int64_t batch_size, const std::string& kv_cache_dtype, + torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...] + torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...] 
+ torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES] + torch::Tensor const& cu_seq_lens, // [BATCH+1] + torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS] + int64_t num_tokens, const std::string& kv_cache_dtype, torch::Tensor const& scale, std::optional seq_starts = std::nullopt) { at::cuda::OptionalCUDAGuard device_guard(src_cache.device()); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); int32_t block_size = src_cache.size(1); - int32_t entry_size = src_cache.flatten(2, -1).size(2); + int32_t head_dim = dst.size(-1); TORCH_CHECK(block_table.dtype() == torch::kInt32, "block_table must be int32"); @@ -1038,6 +1030,9 @@ void gather_and_maybe_dequant_cache( TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32, "seq_starts must be int32"); } + TORCH_CHECK(head_dim == 576, + "gather_and_maybe_dequant_cache only support the head_dim to 576 " + "for better performance") TORCH_CHECK(src_cache.device() == dst.device(), "src_cache and dst must be on the same device"); @@ -1055,10 +1050,9 @@ void gather_and_maybe_dequant_cache( int64_t cache_entry_stride = src_cache.stride(1); int64_t dst_entry_stride = dst.stride(0); - // Decide on the number of splits based on the batch size. - int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16; - dim3 grid(batch_size, num_splits); - dim3 block(1024); + constexpr int32_t thread_block_size = 64; + dim3 grid(num_tokens); + dim3 block(thread_block_size); const int32_t* seq_starts_ptr = seq_starts.has_value() ? seq_starts.value().data_ptr() : nullptr; diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 5af74c2c2a6b0..14913bef13125 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -695,7 +695,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { cache_ops.def( "gather_and_maybe_dequant_cache(Tensor src_cache, Tensor! 
dst, " " Tensor block_table, Tensor cu_seq_lens, " - " int batch_size, " + " Tensor token_to_seq, " + " int num_tokens, " " str kv_cache_dtype, " " Tensor scale, Tensor? seq_starts) -> ()"); cache_ops.impl("gather_and_maybe_dequant_cache", torch::kCUDA, diff --git a/docker/Dockerfile b/docker/Dockerfile index 1b937bbc1225e..84a1802dbe03a 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -20,8 +20,8 @@ ARG PYTHON_VERSION=3.12 # glibc version is baked into the distro, and binaries built with one glibc # version are not backwards compatible with OSes that use an earlier version. ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 -# TODO: Restore to base image after FlashInfer AOT wheel fixed -ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 +# Using cuda base image with minimal dependencies necessary for JIT compilation (FlashInfer, DeepGEMM, EP kernels) +ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 # By parameterizing the Deadsnakes repository URL, we allow third-party to use # their own mirror. 
When doing so, we don't benefit from the transparent @@ -85,7 +85,20 @@ ARG GET_PIP_URL RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && apt-get update -y \ - && apt-get install -y ccache software-properties-common git curl sudo python3-pip libibverbs-dev \ + && apt-get install -y --no-install-recommends \ + ccache \ + software-properties-common \ + git \ + curl \ + sudo \ + python3-pip \ + libibverbs-dev \ + # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 + # as it was causing spam when compiling the CUTLASS kernels + gcc-10 \ + g++-10 \ + && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 \ + && rm -rf /var/lib/apt/lists/* \ && curl -LsSf https://astral.sh/uv/install.sh | sh \ && $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \ && rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \ @@ -110,10 +123,6 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match" # Use copy mode to avoid hardlink failures with Docker cache mounts ENV UV_LINK_MODE=copy -# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 -# as it was causing spam when compiling the CUTLASS kernels -RUN apt-get install -y gcc-10 g++-10 -RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 RUN < /dev/null && \ echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \ @@ -25,10 +25,14 @@ RUN apt clean && apt-get update -y && \ RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1 -RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing +RUN apt install -y libze1 
libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing intel-ocloc + +# This oneccl contains the BMG support which is not the case for default version of oneapi 2025.2. +RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.6/intel-oneccl-2021.15.6.9_offline.sh +RUN bash intel-oneccl-2021.15.6.9_offline.sh -a --silent --eula accept && \ + echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc && \ + echo "source /opt/intel/oneapi/ccl/2021.15/env/vars.sh --force" >> /root/.bashrc -RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.4/intel-oneccl-2021.15.4.11_offline.sh -RUN bash intel-oneccl-2021.15.4.11_offline.sh -a --silent --eula accept && echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc SHELL ["bash", "-c"] CMD ["bash", "-c", "source /root/.bashrc && exec bash"] @@ -72,6 +76,7 @@ RUN python3 -m pip install -e tests/vllm_test_utils ENV NIXL_VERSION=0.7.0 RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py +# remove torch bundled oneccl to avoid conflicts RUN --mount=type=cache,target=/root/.cache/pip \ pip uninstall oneccl oneccl-devel -y diff --git a/docs/assets/contributing/dockerfile-stages-dependency.png b/docs/assets/contributing/dockerfile-stages-dependency.png index 57a33524a5169..b327eb2151f50 100644 Binary files a/docs/assets/contributing/dockerfile-stages-dependency.png and b/docs/assets/contributing/dockerfile-stages-dependency.png differ diff --git a/docs/features/quantization/inc.md b/docs/features/quantization/inc.md index 5e86e9388f328..9875bc44c9144 100644 --- a/docs/features/quantization/inc.md +++ b/docs/features/quantization/inc.md @@ -22,9 +22,6 @@ export QUANT_CONFIG=/path/to/quant/config/inc/meta-llama-3.1-405b-instruct/maxab vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --tensor_paralel_size 8 ``` -!!! 
tip - If you are just prototyping or testing your model with FP8, you can use the `VLLM_SKIP_WARMUP=true` environment variable to disable the warmup stage, which can take a long time. However, we do not recommend disabling this feature in production environments as it causes a significant performance drop. - !!! tip When using FP8 models, you may experience timeouts caused by the long compilation time of FP8 operations. To mitigate this problem, you can use the below environment variables: `VLLM_ENGINE_ITERATION_TIMEOUT_S` - to adjust the vLLM server timeout. You can set the value in seconds, e.g., 600 equals 10 minutes. diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index e38627c707884..7d52891bea7b9 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -7,7 +7,7 @@ This document shows you some examples of the different options that are available to generate structured outputs. !!! warning - If you are still using the following deprecated API fields, please update your code to use `structured_outputs` as demonstrated in the rest of this document: + If you are still using the following deprecated API fields which were removed in v0.12.0, please update your code to use `structured_outputs` as demonstrated in the rest of this document: - `guided_json` -> `{"structured_outputs": {"json": ...}}` or `StructuredOutputsParams(json=...)` - `guided_regex` -> `{"structured_outputs": {"regex": ...}}` or `StructuredOutputsParams(regex=...)` diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 404519f887dc6..25579835faf63 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -680,6 +680,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + IE+ + VE+ | `zai-org/GLM-4.5V`, etc. 
| ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | +| `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + IE+ | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | | `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ + VE+ | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ | | `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py old mode 100644 new mode 100755 index 04e6f99f8957e..df6e96ca375fc --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -425,6 +425,13 @@ def parse_args(): default=None, help="Set the seed when initializing `vllm.LLM`.", ) + parser.add_argument( + "--tensor-parallel-size", + "-tp", + type=int, + default=None, + help="Tensor parallel size to override the model's default setting. 
", + ) return parser.parse_args() @@ -434,6 +441,12 @@ def main(args): if model not in model_example_map: raise ValueError(f"Model type {model} is not supported.") + if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1: + raise ValueError( + f"tensor_parallel_size must be a positive integer, " + f"got {args.tensor_parallel_size}" + ) + audio_count = args.num_audios req_data = model_example_map[model]( question_per_audio_count[audio_count], audio_count @@ -446,6 +459,8 @@ def main(args): ) engine_args = asdict(req_data.engine_args) | {"seed": args.seed} + if args.tensor_parallel_size is not None: + engine_args["tensor_parallel_size"] = args.tensor_parallel_size llm = LLM(**engine_args) # We set temperature to 0.2 so that outputs can be different diff --git a/examples/offline_inference/qwen3_omni/only_thinker.py b/examples/offline_inference/qwen3_omni/only_thinker.py new file mode 100644 index 0000000000000..88a61ed694c2e --- /dev/null +++ b/examples/offline_inference/qwen3_omni/only_thinker.py @@ -0,0 +1,170 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on Qwen2.5-Omni (thinker only). +""" + +from typing import NamedTuple + +from vllm import LLM, SamplingParams +from vllm.assets.audio import AudioAsset +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset +from vllm.multimodal.image import convert_image_mode +from vllm.utils.argparse_utils import FlexibleArgumentParser + + +class QueryResult(NamedTuple): + inputs: dict + limit_mm_per_prompt: dict[str, int] + + +# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on +# lower-end GPUs. +# Unless specified, these settings have been tested to work on a single L4. 
+ +default_system = ( + "You are Qwen, a virtual human developed by the Qwen Team, Alibaba " + "Group, capable of perceiving auditory and visual inputs, as well as " + "generating text and speech." +) + + +def get_mixed_modalities_query() -> QueryResult: + question = ( + "What is recited in the audio? " + "What is the content of this image? Why is this video funny?" + ) + prompt = ( + f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n<|audio_start|><|audio_pad|><|audio_end|>" + "<|vision_start|><|image_pad|><|vision_end|>" + "<|vision_start|><|video_pad|><|vision_end|>" + f"{question}<|im_end|>\n" + f"<|im_start|>assistant\n" + ) + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": { + "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate, + "image": convert_image_mode( + ImageAsset("cherry_blossom").pil_image, "RGB" + ), + "video": VideoAsset(name="baby_reading", num_frames=16).np_ndarrays, + }, + }, + limit_mm_per_prompt={"audio": 1, "image": 1, "video": 1}, + ) + + +def get_use_audio_in_video_query() -> QueryResult: + question = ( + "Describe the content of the video in details, then convert what the " + "baby say into text." + ) + prompt = ( + f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>" + f"{question}<|im_end|>\n" + f"<|im_start|>assistant\n" + ) + asset = VideoAsset(name="baby_reading", num_frames=16) + audio = asset.get_audio(sampling_rate=16000) + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": { + "video": asset.np_ndarrays, + "audio": audio, + }, + "mm_processor_kwargs": { + "use_audio_in_video": True, + }, + }, + limit_mm_per_prompt={"audio": 1, "video": 1}, + ) + + +def get_multi_audios_query() -> QueryResult: + question = "Are these two audio clips the same?" 
+ prompt = ( + f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n<|audio_start|><|audio_pad|><|audio_end|>" + "<|audio_start|><|audio_pad|><|audio_end|>" + f"{question}<|im_end|>\n" + f"<|im_start|>assistant\n" + ) + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": { + "audio": [ + AudioAsset("winning_call").audio_and_sample_rate, + AudioAsset("mary_had_lamb").audio_and_sample_rate, + ], + }, + }, + limit_mm_per_prompt={ + "audio": 2, + }, + ) + + +query_map = { + "mixed_modalities": get_mixed_modalities_query, + "use_audio_in_video": get_use_audio_in_video_query, + "multi_audios": get_multi_audios_query, +} + + +def main(args): + model_name = "Qwen/Qwen3-Omni-30B-A3B-Instruct" + query_result = query_map[args.query_type]() + + llm = LLM( + model=model_name, + max_model_len=12800, + max_num_seqs=5, + limit_mm_per_prompt=query_result.limit_mm_per_prompt, + seed=args.seed, + ) + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
+ sampling_params = SamplingParams(temperature=0.2, max_tokens=256) + + outputs = llm.generate(query_result.inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +def parse_args(): + parser = FlexibleArgumentParser( + description="Demo on using vLLM for offline inference with " + "audio language models" + ) + parser.add_argument( + "--query-type", + "-q", + type=str, + default="mixed_modalities", + choices=query_map.keys(), + help="Query type.", + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py old mode 100644 new mode 100755 index 624de2a2debc3..8f72bf6f0b0d1 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -538,6 +538,31 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData: ) +# HunyuanOCR +def run_hunyuan_vl(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + + model_name = "tencent/HunyuanOCR" + + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + limit_mm_per_prompt={modality: 1}, + ) + + placeholder = "<|hy_place▁holder▁no▁100|><|hy_place▁holder▁no▁102|><|hy_place▁holder▁no▁101|>" # noqa: E501 + prompts = [ + f"<|hy_begin▁of▁sentence|>{placeholder}{question}<|hy_User|>" + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=None, + ) + + # naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B def run_hyperclovax_seed_vision( questions: list[str], modality: str @@ -1820,6 +1845,7 @@ model_example_map = { "glm4_5v": run_glm4_5v, "glm4_5v_fp8": run_glm4_5v_fp8, "h2ovl_chat": run_h2ovl, + "hunyuan_vl": 
run_hunyuan_vl, "hyperclovax_seed_vision": run_hyperclovax_seed_vision, "idefics3": run_idefics3, "interns1": run_interns1, @@ -2038,6 +2064,13 @@ def parse_args(): help="If True, will send all requests in a second batch with empty mm " "data to verify cache hits with UUIDs.", ) + parser.add_argument( + "--tensor-parallel-size", + "-tp", + type=int, + default=None, + help="Tensor parallel size to override the model's default setting. ", + ) return parser.parse_args() @@ -2046,6 +2079,12 @@ def main(args): if model not in model_example_map: raise ValueError(f"Model type {model} is not supported.") + if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1: + raise ValueError( + f"tensor_parallel_size must be a positive integer, " + f"got {args.tensor_parallel_size}" + ) + modality = args.modality mm_input = get_multi_modal_input(args) data = mm_input["data"] @@ -2063,6 +2102,8 @@ def main(args): "seed": args.seed, "mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4, } + if args.tensor_parallel_size is not None: + engine_args["tensor_parallel_size"] = args.tensor_parallel_size llm = LLM(**engine_args) # Don't want to check the flag multiple times, so just hijack `prompts`. 
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py old mode 100644 new mode 100755 index 301265d4e17f7..7ba4e64b567de --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -1352,10 +1352,18 @@ model_example_map = { } -def run_generate(model, question: str, image_urls: list[str], seed: int | None): +def run_generate( + model, + question: str, + image_urls: list[str], + seed: int | None, + tensor_parallel_size: int | None, +): req_data = model_example_map[model](question, image_urls) - engine_args = asdict(req_data.engine_args) | {"seed": args.seed} + engine_args = asdict(req_data.engine_args) | {"seed": seed} + if tensor_parallel_size is not None: + engine_args["tensor_parallel_size"] = tensor_parallel_size llm = LLM(**engine_args) sampling_params = SamplingParams( @@ -1378,7 +1386,13 @@ def run_generate(model, question: str, image_urls: list[str], seed: int | None): print("-" * 50) -def run_chat(model: str, question: str, image_urls: list[str], seed: int | None): +def run_chat( + model: str, + question: str, + image_urls: list[str], + seed: int | None, + tensor_parallel_size: int | None, +): req_data = model_example_map[model](question, image_urls) # Disable other modalities to save memory @@ -1388,6 +1402,8 @@ def run_chat(model: str, question: str, image_urls: list[str], seed: int | None) ) engine_args = asdict(req_data.engine_args) | {"seed": seed} + if tensor_parallel_size is not None: + engine_args["tensor_parallel_size"] = tensor_parallel_size llm = LLM(**engine_args) sampling_params = ( @@ -1463,6 +1479,13 @@ def parse_args(): default=2, help="Number of images to use for the demo.", ) + parser.add_argument( + "--tensor-parallel-size", + "-tp", + type=int, + default=None, + help="Tensor parallel size to override the model's default setting. 
", + ) return parser.parse_args() @@ -1470,13 +1493,20 @@ def main(args: Namespace): model = args.model_type method = args.method seed = args.seed + tensor_parallel_size = args.tensor_parallel_size + + if tensor_parallel_size is not None and tensor_parallel_size < 1: + raise ValueError( + f"tensor_parallel_size must be a positive integer, " + f"got {tensor_parallel_size}" + ) image_urls = IMAGE_URLS[: args.num_images] if method == "generate": - run_generate(model, QUESTION, image_urls, seed) + run_generate(model, QUESTION, image_urls, seed, tensor_parallel_size) elif method == "chat": - run_chat(model, QUESTION, image_urls, seed) + run_chat(model, QUESTION, image_urls, seed, tensor_parallel_size) else: raise ValueError(f"Invalid method: {method}") diff --git a/requirements/kv_connectors.txt b/requirements/kv_connectors.txt index b1f3269cd3813..083230c171096 100644 --- a/requirements/kv_connectors.txt +++ b/requirements/kv_connectors.txt @@ -1,2 +1,2 @@ lmcache -nixl >= 0.6.0 # Required for disaggregated prefill +nixl >= 0.7.1 # Required for disaggregated prefill diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt index 2d57e7e167869..8a91b59de6f72 100644 --- a/requirements/rocm-test.txt +++ b/requirements/rocm-test.txt @@ -45,3 +45,7 @@ multiprocess==0.70.16 # Plugins test terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e +torchgeo==0.7.0 + +# Required for suffix decoding test +arctic-inference == 0.1.1 diff --git a/requirements/xpu.txt b/requirements/xpu.txt index 59ea710684a2c..c1dc4195b5231 100644 --- a/requirements/xpu.txt +++ b/requirements/xpu.txt @@ -10,9 +10,9 @@ wheel jinja2>=3.1.6 datasets # for benchmark scripts numba == 0.61.2 # Required for N-gram speculative decoding -torch==2.8.0+xpu +--extra-index-url=https://download.pytorch.org/whl/xpu +torch==2.9.0+xpu torchaudio torchvision ---extra-index-url=https://download.pytorch.org/whl/xpu -intel-extension-for-pytorch @ 
https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post1%2Bxpu-cp312-cp312-linux_x86_64.whl +intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.9.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py index 661172e1965b5..53c3f875d2003 100644 --- a/tests/compile/distributed/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -111,6 +111,17 @@ if current_platform.is_cuda(): async_tp=96, # MLP is MoE, half the fusions of dense ), ), + ModelBackendTestCase( + model_name="openai/gpt-oss-20b", + model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), + backend=AttentionBackendEnum.FLASHINFER, + matches=Matches( + attention_fusion=0, + allreduce_fusion=49, + sequence_parallel=49, + async_tp=48, + ), + ), ] elif current_platform.is_rocm(): diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 4e7b765d7713f..65a6fd20bd0d1 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -183,9 +183,6 @@ async def test_metrics_counts( EXPECTED_METRICS_V1 = [ "vllm:num_requests_running", "vllm:num_requests_waiting", - "vllm:gpu_cache_usage_perc", - "vllm:gpu_prefix_cache_queries", - "vllm:gpu_prefix_cache_hits", "vllm:kv_cache_usage_perc", "vllm:prefix_cache_queries", "vllm:prefix_cache_hits", diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 6251e1776c30a..8fd3545eccffa 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - +import 
importlib import json import time @@ -35,6 +35,10 @@ GET_WEATHER_SCHEMA = { @pytest.fixture(scope="module") def server(): + assert importlib.util.find_spec("gpt_oss") is not None, ( + "Harmony tests require gpt_oss package to be installed" + ) + args = ["--enforce-eager", "--tool-server", "demo", "--max_model_len", "5000"] env_dict = dict( VLLM_ENABLE_RESPONSES_API_STORE="1", diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py index 028e164cb801b..acf46d75d62eb 100644 --- a/tests/kernels/attention/test_cache.py +++ b/tests/kernels/attention/test_cache.py @@ -921,12 +921,16 @@ def test_gather_and_maybe_dequant_cache_mla( ) _fill_mla_cache(src_cache, kv_cache_dtype=kv_cache_dtype) - seq_len_tensor = torch.randint(0, max_seq_len + 1, (batch_size,), device=device) + seq_len_tensor = torch.randint( + max_seq_len, max_seq_len + 1, (batch_size,), device=device + ) total_tokens = seq_len_tensor.sum() cu_seq_lens = torch.empty((batch_size + 1), dtype=torch.int32, device=device) cu_seq_lens[0] = 0 cu_seq_lens[1:] = seq_len_tensor.cumsum(dim=0).to(dtype=torch.int32) + token_to_seq = torch.arange(0, batch_size, dtype=torch.int32, device=device) + token_to_seq = torch.repeat_interleave(token_to_seq, seq_len_tensor) print("seq_len_tensor", seq_len_tensor) tot_blocks_tensor = (seq_len_tensor + block_size - 1) // block_size @@ -977,7 +981,8 @@ def test_gather_and_maybe_dequant_cache_mla( dst, block_table, cu_seq_lens, - batch_size, + token_to_seq, + total_tokens, kv_cache_dtype, scale, None, @@ -990,7 +995,8 @@ def test_gather_and_maybe_dequant_cache_mla( dst, block_table, cu_seq_lens, - batch_size, + token_to_seq, + total_tokens, kv_cache_dtype, scale, None, diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py index 638741e91619b..a6977f222408d 100644 --- a/tests/kernels/moe/test_flashinfer.py +++ b/tests/kernels/moe/test_flashinfer.py @@ -11,7 +11,6 @@ from vllm.model_executor.layers.fused_moe.config 
import ( fp8_w8a8_moe_quant_config, ) from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts -from vllm.model_executor.layers.fused_moe.layer import FusedMoE from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( apply_flashinfer_per_tensor_scale_fp8, flashinfer_cutlass_moe_fp8, @@ -151,14 +150,11 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph( td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=True) score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16) - topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids = Llama4MoE.custom_routing_function( hidden_states=td.hidden_states, - router_logits=score, - use_grouped_topk=False, - top_k=topk, + gating_output=score, + topk=topk, renormalize=False, - custom_routing_function=Llama4MoE.custom_routing_function, - scoring_func="softmax", ) quant_config = fp8_w8a8_moe_quant_config( @@ -219,14 +215,11 @@ def test_flashinfer_cutlass_moe_fp8_no_graph( ) score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16) - topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids = Llama4MoE.custom_routing_function( hidden_states=td.hidden_states, - router_logits=score, - use_grouped_topk=False, - top_k=topk, + gating_output=score, + topk=topk, renormalize=False, - custom_routing_function=Llama4MoE.custom_routing_function, - scoring_func="softmax", ) quant_config = fp8_w8a8_moe_quant_config( diff --git a/tests/model_executor/test_qwen3_omni.py b/tests/model_executor/test_qwen3_omni.py new file mode 100644 index 0000000000000..c92c61dcd3bc2 --- /dev/null +++ b/tests/model_executor/test_qwen3_omni.py @@ -0,0 +1,221 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from unittest.mock import Mock + +import pytest +from transformers import PretrainedConfig + +from vllm.multimodal.processing import InputProcessingContext + + +# Helper function to print input IDs with 
coalesced audio/video tokens. +def print_input_ids(input_ids): + """ + Print input IDs, compressing consecutive special tokens. + - 151675: <|audio_pad|> + - 151656: <|video_pad|> + """ + if not input_ids: + print("[]") + return + + result = [] + i = 0 + + while i < len(input_ids): + current_id = input_ids[i] + + # Check if it's a special token that should be compressed + if current_id in [151675, 151656]: + # Count consecutive occurrences + count = 1 + while i + count < len(input_ids) and input_ids[i + count] == current_id: + count += 1 + + # Add compressed representation + token_name = "<|audio_pad|>" if current_id == 151675 else "<|video_pad|>" + result.append(f"{token_name} * {count}") + i += count + else: + # Regular token, just add it + result.append(str(current_id)) + i += 1 + + print(", ".join(result)) + + +@pytest.fixture +def mock_qwen3_omni_config(): + """Create a mock Qwen3OmniMoeThinker config.""" + config = Mock(spec=PretrainedConfig) + # Token IDs from https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct/blob/main/tokenizer_config.json + config.audio_token_id = 151675 # <|audio_pad|> + config.video_token_id = 151656 # <|video_pad|> + config.image_token_id = 151655 # <|image_pad|> + config.audio_start_token_id = 151669 # <|audio_start|> + config.audio_end_token_id = 151670 # <|audio_end|> + config.vision_start_token_id = 151652 # <|vision_start|> + config.position_id_per_seconds = 12.5 + + # Vision config + vision_config = Mock() + vision_config.spatial_merge_size = 2 + config.vision_config = vision_config + + return config + + +@pytest.fixture +def mock_processor(): + """Create a mock HF processor.""" + from transformers.models.whisper import WhisperFeatureExtractor + + processor = Mock() + processor.audio_token = "<|audio_pad|>" + processor.image_token = "<|image_pad|>" + processor.video_token = "<|video_pad|>" + + # Create a real WhisperFeatureExtractor instance for the feature_extractor attribute + feature_extractor = WhisperFeatureExtractor() 
+ processor.feature_extractor = feature_extractor + + return processor + + +@pytest.fixture +def mock_tokenizer(): + """Create a mock tokenizer.""" + tokenizer = Mock() + # Token IDs from https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct/blob/main/tokenizer_config.json + tokenizer.get_vocab = Mock( + return_value={ + "<|audio_pad|>": 151675, + "<|video_pad|>": 151656, + "<|image_pad|>": 151655, + "<|audio_start|>": 151669, + "<|audio_end|>": 151670, + "<|vision_start|>": 151652, + "<|vision_end|>": 151653, + } + ) + tokenizer.encode = Mock( + side_effect=lambda x: { + "<|vision_start|>": [151652], + "<|vision_end|>": [151653], + "<|audio_start|>": [151669], + "<|audio_end|>": [151670], + "<|audio_pad|>": [151675], + "<|image_pad|>": [151655], + "<|video_pad|>": [151656], + }.get(x, [0]) + ) + tokenizer.vision_bos_token = "<|vision_start|>" + tokenizer.vision_eos_token = "<|vision_end|>" + tokenizer.audio_bos_token = "<|audio_start|>" + tokenizer.audio_eos_token = "<|audio_end|>" + return tokenizer + + +@pytest.fixture +def mock_image_processor(): + """Create a mock image processor.""" + image_processor = Mock() + image_processor.merge_size = 2 + return image_processor + + +def test_qwen3_omni_get_updates_use_audio_in_video( + mock_qwen3_omni_config, + mock_processor, + mock_tokenizer, + mock_image_processor, +): + """Test the get_updates_use_audio_in_video method directly.""" + + from vllm.model_executor.models.qwen3_omni_moe_thinker import ( + Qwen3OmniMoeThinkerMultiModalProcessor, + Qwen3OmniMoeThinkerProcessingInfo, + ) + + # Create a mock context + mock_ctx = Mock(spec=InputProcessingContext) + + # Create processing info + info = Qwen3OmniMoeThinkerProcessingInfo(mock_ctx) + info.get_hf_config = Mock(return_value=mock_qwen3_omni_config) + info.get_hf_processor = Mock(return_value=mock_processor) + info.get_tokenizer = Mock(return_value=mock_tokenizer) + info.get_image_processor = Mock(return_value=mock_image_processor) + + # Create a mock dummy_inputs 
builder + mock_dummy_inputs = Mock() + + # Create the processor + processor = Qwen3OmniMoeThinkerMultiModalProcessor(info, mock_dummy_inputs) + + # Test parameters from reference video + # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4 + audio_len = 85 + video_grid_thw = [6, 36, 64] + video_second_per_grid_t = 2.0 + + # Call the method + updates = processor.get_updates_use_audio_in_video( + thinker_config=mock_qwen3_omni_config, + audio_len=audio_len, + video_grid_thw=video_grid_thw, + video_second_per_grid_t=video_second_per_grid_t, + ) + + # Updated input ids should align with HF implementation. + # 151669, + # <|video_pad|> * 576, <|audio_pad|> * 25, + # <|video_pad|> * 576, <|audio_pad|> * 25, + # <|video_pad|> * 576, <|audio_pad|> * 25, + # <|video_pad|> * 576, <|audio_pad|> * 10, + # <|video_pad|> * 1152, + # 151670 + print_input_ids(updates) + + # Verify structure + assert isinstance(updates, list) + assert len(updates) > 0 + + # Verify start and end tokens + audio_start_token_id = mock_qwen3_omni_config.audio_start_token_id + audio_end_token_id = mock_qwen3_omni_config.audio_end_token_id + + assert updates[0] == audio_start_token_id + assert updates[-1] == audio_end_token_id + + # Verify both audio and video tokens are present + audio_token_id = mock_qwen3_omni_config.audio_token_id + video_token_id = mock_qwen3_omni_config.video_token_id + + audio_count = updates.count(audio_token_id) + video_count = updates.count(video_token_id) + + assert audio_count == audio_len, ( + f"Expected {audio_len} audio tokens, got {audio_count}" + ) + + # Calculate expected video token count + spatial_merge_size = mock_qwen3_omni_config.vision_config.spatial_merge_size + height = video_grid_thw[1] // spatial_merge_size + width = video_grid_thw[2] // spatial_merge_size + expected_video_count = video_grid_thw[0] * height * width + + assert video_count == expected_video_count, ( + f"Expected {expected_video_count} video tokens, got {video_count}" + ) + + 
# Total tokens should be: 1 (start) + audio_len + video_count + 1 (end) + expected_total = 1 + audio_len + expected_video_count + 1 + assert len(updates) == expected_total, ( + f"Expected {expected_total} total tokens, got {len(updates)}" + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/models/registry.py b/tests/models/registry.py index 758ec54493aa3..f8b3470e6d39b 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -626,6 +626,10 @@ _MULTIMODAL_EXAMPLE_MODELS = { "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B", trust_remote_code=True, ), + "HunYuanVLForConditionalGeneration": _HfExamplesInfo( + "tencent/HunyuanOCR", + is_available_online=False, + ), "Idefics3ForConditionalGeneration": _HfExamplesInfo( "HuggingFaceM4/Idefics3-8B-Llama3", extras={"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}, diff --git a/tests/test_routing_simulator.py b/tests/test_routing_simulator.py index 5a162fa8f791b..e8826eb441a24 100644 --- a/tests/test_routing_simulator.py +++ b/tests/test_routing_simulator.py @@ -9,9 +9,16 @@ different routing strategies and analyze their performance, including integration tests with FusedMoE layer. 
""" +import tempfile + import pytest import torch +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.distributed import ( + init_distributed_environment, + initialize_model_parallel, +) from vllm.model_executor.layers.fused_moe.routing_simulator import ( DistributionBasedRouting, RoutingSimulator, @@ -89,6 +96,28 @@ def test_routing_strategy_integration(monkeypatch, device): # Test different routing strategies strategies = RoutingSimulator.get_available_strategies() + vllm_config = VllmConfig() + with set_current_vllm_config(vllm_config): + temp_file = tempfile.mkstemp()[1] + init_distributed_environment( + world_size=1, + rank=0, + local_rank=0, + distributed_init_method=f"file://{temp_file}", + ) + initialize_model_parallel( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + ) + fused_moe = FusedMoE( + num_experts=num_experts, + top_k=top_k, + hidden_size=hidden_size, + intermediate_size=0, + use_grouped_topk=False, + renormalize=True, + ) + for strategy in strategies: # Set environment variable env_name = "VLLM_MOE_ROUTING_SIMULATION_STRATEGY" @@ -98,13 +127,9 @@ def test_routing_strategy_integration(monkeypatch, device): envs.environment_variables[env_name] = lambda s=strategy: s # Test the select_experts method - topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids, _ = fused_moe.select_experts( hidden_states=hidden_states, router_logits=router_logits, - top_k=top_k, - use_grouped_topk=False, - renormalize=True, - indices_type=torch.long, ) # Verify output shapes diff --git a/tests/v1/attention/test_rocm_attention_backends_selection.py b/tests/v1/attention/test_rocm_attention_backends_selection.py index 4ec79e9eb6ba4..80158d4b7278c 100644 --- a/tests/v1/attention/test_rocm_attention_backends_selection.py +++ b/tests/v1/attention/test_rocm_attention_backends_selection.py @@ -36,6 +36,12 @@ def mock_on_gfx9(): @pytest.mark.parametrize( "env_vars, selected_backend, expected_backend_path", [ + # Test 
Case: Explicit FLEX_ATTENTION backend + ( + {}, + "FLEX_ATTENTION", + AttentionBackendEnum.FLEX_ATTENTION.get_path(), + ), # Test Case 1: Default (no env vars, no explicit backend) ( {}, diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index dea89babd4b47..df3d53332c7cd 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -340,4 +340,11 @@ full_cg_backend_configs = { "cudagraph_mode": "FULL_AND_PIECEWISE", }, ), + "RocmAttn": BackendConfig( + name="RocmAttn", + env_vars={"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"}, + comp_config={ + "cudagraph_mode": "FULL", + }, + ), } diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 24611a4aaa1b8..12ed59b6e863b 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -1436,6 +1436,65 @@ def test_get_kv_cache_config_one_worker(): ], ) + # 6 full + 5 sliding, pad to 6 full + 6 sliding. This is a typical case for gpt-oss + # eagle where there is only one more full attention layer than sliding window layers + kv_cache_specs_hybrid = { + "layer_1": new_kv_cache_spec(), + "layer_2": new_kv_cache_spec(), + "layer_3": new_kv_cache_spec(), + "layer_4": new_kv_cache_spec(), + "layer_5": new_kv_cache_spec(), + "layer_6": new_kv_cache_spec(), + "layer_7": new_sliding_window_spec(), + "layer_8": new_sliding_window_spec(), + "layer_9": new_sliding_window_spec(), + "layer_10": new_sliding_window_spec(), + "layer_11": new_sliding_window_spec(), + } + + kv_cache_config_hybrid = get_kv_cache_configs( + vllm_config, [kv_cache_specs_hybrid], [mem_per_block_per_layer * 6 * 32] + )[0] + print(kv_cache_config_hybrid) + assert kv_cache_config_hybrid == KVCacheConfig( + num_blocks=32, + kv_cache_tensors=[ + KVCacheTensor( + size=mem_per_block_per_layer * 32, + shared_by=["layer_1", "layer_7"], + ), + KVCacheTensor( + size=mem_per_block_per_layer * 32, + shared_by=["layer_2", "layer_8"], + ), + KVCacheTensor( + 
size=mem_per_block_per_layer * 32, + shared_by=["layer_3", "layer_9"], + ), + KVCacheTensor( + size=mem_per_block_per_layer * 32, + shared_by=["layer_4", "layer_10"], + ), + KVCacheTensor( + size=mem_per_block_per_layer * 32, + shared_by=["layer_5", "layer_11"], + ), + KVCacheTensor( + size=mem_per_block_per_layer * 32, + shared_by=["layer_6"], + ), + ], + kv_cache_groups=[ + KVCacheGroupSpec( + ["layer_1", "layer_2", "layer_3", "layer_4", "layer_5", "layer_6"], + new_kv_cache_spec(), + ), + KVCacheGroupSpec( + ["layer_7", "layer_8", "layer_9", "layer_10", "layer_11"], + new_sliding_window_spec(), + ), + ], + ) # different hidden size kv_cache_specs_hybrid = { "layer_1": new_kv_cache_spec(head_size=128), diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 09acde6e08faa..fe4153e609971 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -641,6 +641,34 @@ def test_schedule_concurrent_batches( scheduler.update_from_output(scheduler_output1, model_runner_output) +@pytest.mark.parametrize("enable_chunked_prefill", [True, False]) +def test_schedule_order(enable_chunked_prefill: bool): + scheduler = create_scheduler( + max_num_batched_tokens=1024, + max_num_seqs=3, + enable_chunked_prefill=enable_chunked_prefill, + ) + + # long requests + requests = create_requests(num_requests=2, num_tokens=800) + # short requests + requests += create_requests(num_requests=2, num_tokens=10) + + for request in requests: + scheduler.add_request(request) + + scheduler_output1 = scheduler.schedule() + + if enable_chunked_prefill: + # When enable chunked prefill, long requests will be chunked. + assert len(scheduler_output1.scheduled_new_reqs) == 2 + else: + # When disable chunked prefill, should not skip the long requests, + # and scheduling subsequent short requests in advance, + # even though there is still token budgets remaining. 
+ assert len(scheduler_output1.scheduled_new_reqs) == 1 + + def test_preempt_during_execution(): # NOTE(woosuk): The actual number of available blocks is 10 instead of 11 # because block 0 is reserved as the null block. diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 6830f68736453..7537c7a60476b 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -42,6 +42,7 @@ def create_scheduler( model: str = "facebook/opt-125m", max_num_seqs: int = 16, max_num_batched_tokens: int = 8192, + enable_chunked_prefill: bool = True, enable_prefix_caching: bool = False, long_prefill_token_threshold: int = 0, disable_chunked_mm_input: bool = False, @@ -76,7 +77,7 @@ def create_scheduler( max_model_len=max_model_len, long_prefill_token_threshold=long_prefill_token_threshold, disable_chunked_mm_input=disable_chunked_mm_input, - enable_chunked_prefill=True, + enable_chunked_prefill=enable_chunked_prefill, async_scheduling=async_scheduling, ) model_config = ModelConfig( diff --git a/tests/v1/cudagraph/test_cudagraph_mode.py b/tests/v1/cudagraph/test_cudagraph_mode.py index d6bde16eba36b..7f9c2a0571c3c 100644 --- a/tests/v1/cudagraph/test_cudagraph_mode.py +++ b/tests/v1/cudagraph/test_cudagraph_mode.py @@ -35,14 +35,22 @@ def temporary_environ(env_vars): # test attention backend and cudagraph_mode combo # (backend_name, cudagraph_mode, supported) -combo_cases_1 = [ - ("FA3", "FULL", True), - ("FA3", "FULL_AND_PIECEWISE", True), - ("FA2", "FULL", True), # Should fallback to FULL_AND_PIECEWISE - ("FA2", "FULL_AND_PIECEWISE", True), - ("FlashInfer", "FULL", True), # Should fallback to FULL_AND_PIECEWISE - ("FlashInfer", "FULL_AND_PIECEWISE", True), -] +if current_platform.is_rocm(): + combo_cases_1 = [ + ("RocmAttn", "FULL", True), + ("RocmAttn", "FULL_AND_PIECEWISE", True), + ("TritonAttn", "FULL", True), + ("TritonAttn", "FULL_AND_PIECEWISE", True), + ] +else: + combo_cases_1 = [ + ("FA3", "FULL", True), + ("FA3", "FULL_AND_PIECEWISE", True), + ("FA2", 
"FULL", True), # Should fallback to FULL_AND_PIECEWISE + ("FA2", "FULL_AND_PIECEWISE", True), + ("FlashInfer", "FULL", True), # Should fallback to FULL_AND_PIECEWISE + ("FlashInfer", "FULL_AND_PIECEWISE", True), + ] @pytest.mark.parametrize("backend_name, cudagraph_mode, supported", combo_cases_1) @@ -92,18 +100,32 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte # test cudagraph_mode with different compilation mode. # (backend_name, cudagraph_mode, compilation_mode, supported) -combo_cases_2 = [ - ("FA2", "FULL", CompilationMode.NONE, True), - ("FA2", "FULL", CompilationMode.VLLM_COMPILE, True), - ("FA2", "PIECEWISE", CompilationMode.NONE, False), - ("FA2", "PIECEWISE", CompilationMode.VLLM_COMPILE, True), - ("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, False), - ("FA2", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True), - ("FA2", "FULL_DECODE_ONLY", CompilationMode.NONE, True), - ("FA2", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True), - ("FA2", "NONE", CompilationMode.NONE, True), - ("FA2", "NONE", CompilationMode.VLLM_COMPILE, True), -] +if current_platform.is_rocm(): + combo_cases_2 = [ + ("RocmAttn", "FULL", CompilationMode.NONE, True), + ("RocmAttn", "FULL", CompilationMode.VLLM_COMPILE, True), + ("RocmAttn", "PIECEWISE", CompilationMode.NONE, False), + ("RocmAttn", "PIECEWISE", CompilationMode.VLLM_COMPILE, True), + ("RocmAttn", "FULL_AND_PIECEWISE", CompilationMode.NONE, False), + ("RocmAttn", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True), + ("RocmAttn", "FULL_DECODE_ONLY", CompilationMode.NONE, True), + ("RocmAttn", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True), + ("RocmAttn", "NONE", CompilationMode.NONE, True), + ("RocmAttn", "NONE", CompilationMode.VLLM_COMPILE, True), + ] +else: + combo_cases_2 = [ + ("FA2", "FULL", CompilationMode.NONE, True), + ("FA2", "FULL", CompilationMode.VLLM_COMPILE, True), + ("FA2", "PIECEWISE", CompilationMode.NONE, False), + ("FA2", "PIECEWISE", 
CompilationMode.VLLM_COMPILE, True), + ("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, False), + ("FA2", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True), + ("FA2", "FULL_DECODE_ONLY", CompilationMode.NONE, True), + ("FA2", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True), + ("FA2", "NONE", CompilationMode.NONE, True), + ("FA2", "NONE", CompilationMode.VLLM_COMPILE, True), + ] @pytest.mark.parametrize( diff --git a/tests/v1/distributed/test_eagle_dp.py b/tests/v1/distributed/test_eagle_dp.py new file mode 100644 index 0000000000000..9f6a6614fc1fd --- /dev/null +++ b/tests/v1/distributed/test_eagle_dp.py @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio +import os +from contextlib import AsyncExitStack +from dataclasses import replace + +import pytest + +from vllm import SamplingParams +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.sampling_params import RequestOutputKind +from vllm.v1.engine.async_llm import AsyncLLM + +DP_SIZE = int(os.getenv("DP_SIZE", 2)) + + +@pytest.mark.asyncio +async def test_run_eagle_dp(): + target_model = "meta-llama/Llama-3.1-8B-Instruct" + draft_model = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B" + + engine_args = AsyncEngineArgs( + model=target_model, + tokenizer_mode="auto", + enforce_eager=False, + tensor_parallel_size=int(os.getenv("TP_SIZE", 1)), + data_parallel_size=DP_SIZE, + data_parallel_backend="mp", # ray takes more time + trust_remote_code=True, + max_model_len=16384, + ) + + eagle_engine_args = replace( + engine_args, + speculative_config={ + "model": draft_model, + "method": "eagle", + "num_speculative_tokens": 3, + }, + ) + + prompt = "This is a test of data parallel with eagle" + num_expected_tokens = 100 + sampling_params = SamplingParams( + min_tokens=num_expected_tokens, + max_tokens=num_expected_tokens, + ignore_eos=True, + output_kind=RequestOutputKind.FINAL_ONLY, + temperature=0, + ) + + async def 
generate_with_timeout(given_engine: AsyncLLM): + async for out in given_engine.generate( + request_id="test-eagle-dp", prompt=prompt, sampling_params=sampling_params + ): + token_ids = out.outputs[0].token_ids + assert len(token_ids) == num_expected_tokens + return token_ids + + async def engine_create_and_generate(engine_args: AsyncEngineArgs): + async with AsyncExitStack() as after: + engine = AsyncLLM.from_engine_args(engine_args) + after.callback(engine.shutdown) + + token_ids = await asyncio.wait_for( + generate_with_timeout(engine), timeout=30 + ) + + assert not engine.output_processor.has_unfinished_requests() + return token_ids + + token_ids_with_eagle = await engine_create_and_generate(eagle_engine_args) + token_ids_no_eagle = await engine_create_and_generate(engine_args) + + # Test for correctness + assert token_ids_with_eagle == token_ids_no_eagle diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index d1b037b7956cf..85f108786c05a 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -3,7 +3,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json -from dataclasses import fields from enum import Enum from typing import TYPE_CHECKING, Any @@ -21,7 +20,6 @@ from vllm.outputs import RequestOutput from vllm.platforms import current_platform from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager from vllm.sampling_params import ( - GuidedDecodingParams, SamplingParams, StructuredOutputsParams, ) @@ -108,23 +106,6 @@ class CarDescription(BaseModel): car_type: CarType -def test_guided_decoding_deprecated(): - with pytest.warns(DeprecationWarning, match="GuidedDecodingParams is deprecated.*"): - guided_decoding = GuidedDecodingParams(json_object=True) - - structured_outputs = StructuredOutputsParams(json_object=True) - assert fields(guided_decoding) == 
fields(structured_outputs) - - with pytest.warns(DeprecationWarning, match="guided_decoding is deprecated.*"): - sp1 = SamplingParams(guided_decoding=guided_decoding) - - with pytest.warns(DeprecationWarning, match="guided_decoding is deprecated.*"): - sp2 = SamplingParams.from_optional(guided_decoding=guided_decoding) - - assert sp1 == sp2 - assert sp1.structured_outputs == guided_decoding - - @pytest.mark.parametrize( "model_name, backend, tokenizer_mode, speculative_config", PARAMS_MODELS_BACKENDS_TOKENIZER_MODE, @@ -899,13 +880,11 @@ def test_structured_output_batched_with_non_structured_outputs_requests( output_json = json.loads(generated_text) -@pytest.mark.parametrize("guided_decoding_backend", ["xgrammar"]) -def test_structured_output_with_structural_tag( - guided_decoding_backend: str, -): +@pytest.mark.parametrize("backend", ["xgrammar"]) +def test_structured_output_with_structural_tag(backend: str): llm = LLM( model="Qwen/Qwen2.5-1.5B-Instruct", - guided_decoding_backend=guided_decoding_backend, + structured_outputs_config=StructuredOutputsConfig(backend=backend), ) structural_tag_config = { @@ -923,7 +902,7 @@ def test_structured_output_with_structural_tag( sampling_params = SamplingParams( temperature=0.0, max_tokens=500, - guided_decoding=StructuredOutputsParams( + structured_outputs=StructuredOutputsParams( structural_tag=json.dumps(structural_tag_config) ), ) diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index db79b3f5e8bcb..a8f472d147a0d 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -294,6 +294,8 @@ def _rocm_aiter_mla_decode_fwd_impl( kv_last_page_lens: torch.Tensor | None = None, sm_scale: float = 1.0, logit_cap: float = 0.0, + q_scale: torch.Tensor | None = None, + kv_scale: torch.Tensor | None = None, ) -> None: from aiter.mla import mla_decode_fwd @@ -308,6 +310,8 @@ def _rocm_aiter_mla_decode_fwd_impl( max_seqlen_qo, sm_scale=sm_scale, logit_cap=logit_cap, + q_scale=q_scale, + kv_scale=kv_scale, ) @@ -322,6 +326,8 @@ def 
_rocm_aiter_mla_decode_fwd_fake( kv_last_page_lens: torch.Tensor | None = None, sm_scale: float = 1.0, logit_cap: float = 0.0, + q_scale: torch.Tensor | None = None, + kv_scale: torch.Tensor | None = None, ) -> None: pass @@ -806,6 +812,8 @@ class rocm_aiter_ops: kv_indices: torch.Tensor | None = None, kv_last_page_lens: torch.Tensor | None = None, logit_cap: float = 0.0, + q_scale: torch.Tensor | None = None, + kv_scale: torch.Tensor | None = None, ): torch.ops.vllm.rocm_aiter_mla_decode_fwd( q, @@ -818,6 +826,8 @@ class rocm_aiter_ops: kv_last_page_lens, sm_scale=sm_scale, logit_cap=logit_cap, + q_scale=q_scale, + kv_scale=kv_scale, ) @staticmethod diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 0f625a7945241..4a1bcc761f994 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -2201,7 +2201,8 @@ def gather_and_maybe_dequant_cache( dst: torch.Tensor, block_table: torch.Tensor, cu_seq_lens: torch.Tensor, - batch_size: int, + token_to_seq: torch.Tensor, + num_tokens: int, kv_cache_dtype: str, scale: torch.Tensor, seq_starts: torch.Tensor | None = None, @@ -2211,7 +2212,8 @@ def gather_and_maybe_dequant_cache( dst, block_table, cu_seq_lens, - batch_size, + token_to_seq, + num_tokens, kv_cache_dtype, scale, seq_starts, diff --git a/vllm/attention/ops/common.py b/vllm/attention/ops/common.py index 67c5f7dbba9c0..af6766bdd1615 100644 --- a/vllm/attention/ops/common.py +++ b/vllm/attention/ops/common.py @@ -194,7 +194,6 @@ def _cp_lse_common( cp_attn_lse = cp_attn_lse.contiguous() lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses) out, lse = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx) - assert out.is_contiguous() return out, lse diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 1e66f21ff6388..2d8dd4c51c7ef 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -63,13 +63,14 @@ def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: else: 
logger.debug("Using InductorAdaptor") return InductorAdaptor() - else: - assert compilation_config.backend == "eager", ( - "Custom backends not supported with CompilationMode.VLLM_COMPILE" - ) - + elif compilation_config.backend == "eager": logger.debug("Using EagerAdaptor") return EagerAdaptor() + else: + logger.debug("Using custom backend: %s", compilation_config.backend) + compiler = resolve_obj_by_qualname(current_platform.get_compile_backend())() + assert isinstance(compiler, CompilerInterface) + return compiler class CompilerManager: @@ -545,7 +546,10 @@ class VllmBackend: self.prefix = prefix or model_tag # Passes to run on the graph post-grad. - self.post_grad_pass_manager = PostGradPassManager() + self.pass_manager = resolve_obj_by_qualname( + current_platform.get_pass_manager_cls() + )() + self.pass_key = current_platform.pass_key self.sym_tensor_indices = [] self.input_buffers = [] @@ -562,24 +566,20 @@ class VllmBackend: def configure_post_pass(self): config = self.compilation_config - self.post_grad_pass_manager.configure(self.vllm_config) + self.pass_manager.configure(self.vllm_config) # Post-grad custom passes are run using the post_grad_custom_post_pass # hook. If a pass for that hook exists, add it to the pass manager. 
inductor_config = config.inductor_compile_config - PASS_KEY = "post_grad_custom_post_pass" - if PASS_KEY in inductor_config: - if isinstance(inductor_config[PASS_KEY], PostGradPassManager): + if self.pass_key in inductor_config: + if isinstance(inductor_config[self.pass_key], PostGradPassManager): # PassManager already added to config, make sure it's correct - assert ( - inductor_config[PASS_KEY].uuid() - == self.post_grad_pass_manager.uuid() - ) + assert inductor_config[self.pass_key].uuid() == self.pass_manager.uuid() else: # Config should automatically wrap all inductor passes - assert isinstance(inductor_config[PASS_KEY], InductorPass) - self.post_grad_pass_manager.add(inductor_config[PASS_KEY]) - inductor_config[PASS_KEY] = self.post_grad_pass_manager + assert isinstance(inductor_config[self.pass_key], InductorPass) + self.pass_manager.add(inductor_config[self.pass_key]) + inductor_config[self.pass_key] = self.pass_manager def __call__( self, graph: fx.GraphModule, example_inputs diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 42eccf9f41123..556b2d9168b32 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -331,9 +331,9 @@ class CompilationConfig: We use string to avoid serialization issues when using compilation in a distributed setting. When the compilation mode is 1 or 2, the backend is used for the compilation directly (it sees the whole graph). When the - compilation mode is 3, the backend is used for the piecewise compilation - (it sees a part of the graph). The backend can not be custom for compilation - mode 3, i.e. the backend must be either eager or inductor. Furthermore, + compilation mode is 3, the backend supports both whole graph and piecewise + compilation, available backends include eager, inductor, and custom backends, + the latter of which can be defined via `get_compile_backend`. 
Furthermore, compilation is only piecewise if splitting ops is set accordingly and use_inductor_graph_partition is off. Note that the default options for splitting ops are sufficient for piecewise compilation. @@ -768,7 +768,7 @@ class CompilationConfig: self.backend = "inductor" if self.use_inductor else "eager" if self.backend == "": - self.backend = current_platform.simple_compile_backend + self.backend = current_platform.get_compile_backend() def init_backend(self, vllm_config: "VllmConfig") -> str | Callable: """ @@ -800,9 +800,7 @@ class CompilationConfig: assert self.mode == CompilationMode.VLLM_COMPILE if self.backend not in ["eager", "inductor"]: - raise ValueError( - f"Invalid backend for piecewise compilation: {self.backend}" - ) + logger.info("Using OOT custom backend for compilation.") from vllm.compilation.backends import VllmBackend diff --git a/vllm/config/model.py b/vllm/config/model.py index 49688e17cf932..caa9a3440c41d 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -33,6 +33,7 @@ from vllm.transformers_utils.config import ( try_get_safetensors_metadata, try_get_tokenizer_config, uses_mrope, + uses_xdrope_dim, ) from vllm.transformers_utils.gguf_utils import ( maybe_patch_hf_config_from_gguf, @@ -585,16 +586,26 @@ class ModelConfig: else: # task == "auto" pass else: - debug_info = { - "architectures": architectures, - "is_generative_model": is_generative_model, - "is_pooling_model": is_pooling_model, - } - raise AssertionError( - "The model should be a generative or " - "pooling model when task is set to " - f"{self.task!r}. Found: {debug_info}" - ) + # Neither generative nor pooling model - try to convert if possible + if is_pooling_task: + runner = "pooling" + convert = _task_to_convert(self.task) + msg_hint = ( + "Please replace this option with `--runner pooling " + f"--convert {convert}` to continue using this model " + "as a pooling model." 
+ ) + else: + debug_info = { + "architectures": architectures, + "is_generative_model": is_generative_model, + "is_pooling_model": is_pooling_model, + } + raise AssertionError( + "The model should be a generative or " + "pooling model when task is set to " + f"{self.task!r}. Found: {debug_info}" + ) self.runner = runner self.convert = convert @@ -1605,6 +1616,10 @@ class ModelConfig: def uses_mrope(self) -> bool: return uses_mrope(self.hf_config) + @property + def uses_xdrope_dim(self) -> int: + return uses_xdrope_dim(self.hf_config) + @property def is_multimodal_model(self) -> bool: return self.multimodal_config is not None diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index ad438a8b464e0..913e97250d3d3 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -141,22 +141,6 @@ class ParallelConfig: - "deepep_high_throughput": Use deepep high-throughput kernels - "deepep_low_latency": Use deepep low-latency kernels - "flashinfer_all2allv": Use flashinfer alltoallv kernels for mnnvl""" - num_redundant_experts: int | None = None - """`num_redundant_experts` is deprecated and has been replaced with - `eplb_config.num_redundant_experts`. This will be removed in v0.12.0. - Please use `eplb_config.num_redundant_experts` instead.""" - eplb_window_size: int | None = None - """`eplb_window_size` is deprecated and has been replaced with - `eplb_config.window_size`. This will be removed in v0.12.0. - Please use `eplb_config.window_size` instead.""" - eplb_step_interval: int | None = None - """`eplb_step_interval` is deprecated and has been replaced with - `eplb_config.step_interval`. This will be removed in v0.12.0. - Please use `eplb_config.step_interval` instead.""" - eplb_log_balancedness: bool | None = None - """`eplb_log_balancedness` is deprecated and has been replaced with - `eplb_config.log_balancedness`. This will be removed in v0.12.0. 
- Please use `eplb_config.log_balancedness` instead.""" max_parallel_loading_workers: int | None = None """Maximum number of parallel loading workers when loading model @@ -516,40 +500,6 @@ class ParallelConfig: "--all2all-backend command-line argument instead." ) - # Forward deprecated fields to their new location - if self.num_redundant_experts is not None: - self.eplb_config.num_redundant_experts = self.num_redundant_experts - logger.warning_once( - "num_redundant_experts is deprecated and has been replaced " - "with eplb_config.num_redundant_experts. This will be removed " - "in v0.12.0. Changing this field after initialization will " - "have no effect." - ) - if self.eplb_window_size is not None: - self.eplb_config.window_size = self.eplb_window_size - logger.warning_once( - "eplb_window_size is deprecated and has been replaced " - "with eplb_config.window_size. This will be removed " - "in v0.12.0. Changing this field after initialization will " - "have no effect." - ) - if self.eplb_step_interval is not None: - self.eplb_config.step_interval = self.eplb_step_interval - logger.warning_once( - "eplb_step_interval is deprecated and has been replaced " - "with eplb_config.step_interval. This will be removed " - "in v0.12.0. Changing this field after initialization will " - "have no effect." - ) - if self.eplb_log_balancedness is not None: - self.eplb_config.log_balancedness = self.eplb_log_balancedness - logger.warning_once( - "eplb_log_balancedness is deprecated and has been replaced " - "with eplb_config.log_balancedness. This will be removed " - "in v0.12.0. Changing this field after initialization will " - "have no effect." 
- ) - # Continue with the rest of the initialization self.world_size = ( self.pipeline_parallel_size diff --git a/vllm/distributed/device_communicators/symm_mem.py b/vllm/distributed/device_communicators/symm_mem.py index eb1f173b11925..7a049b003cf73 100644 --- a/vllm/distributed/device_communicators/symm_mem.py +++ b/vllm/distributed/device_communicators/symm_mem.py @@ -131,7 +131,7 @@ class SymmMemCommunicator: return None if out is None: out = torch.empty_like(inp) - self.buffer[: inp.numel()].copy_(inp.view(-1)) + self.buffer[: inp.numel()].copy_(inp.reshape(-1)) # Determine which algorithm to use use_multimem = False diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 7c0911240493c..493938d4aad92 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -4,7 +4,6 @@ import contextlib import copy import logging import math -import os import queue import threading import time @@ -810,9 +809,6 @@ class NixlConnectorWorker: self.nixl_backends = vllm_config.kv_transfer_config.get_from_extra_config( "backends", ["UCX"] ) - # TODO temporary, once nixl allows for telemetry flag in config - # (next release), we can remove this env var. - os.environ["NIXL_TELEMETRY_ENABLE"] = "1" # Agent. non_ucx_backends = [b for b in self.nixl_backends if b != "UCX"] @@ -828,10 +824,11 @@ class NixlConnectorWorker: if nixl_agent_config is None: config = None else: + # Enable telemetry by default for NIXL 0.7.1 and above. 
config = ( - nixl_agent_config(backends=self.nixl_backends) + nixl_agent_config(backends=self.nixl_backends, capture_telemetry=True) if len(non_ucx_backends) > 0 - else nixl_agent_config(num_threads=num_threads) + else nixl_agent_config(num_threads=num_threads, capture_telemetry=True) ) self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), config) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b7c8f56e18c52..8338e54d4fd85 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -502,11 +502,6 @@ class EngineArgs: ) reasoning_parser: str = StructuredOutputsConfig.reasoning_parser reasoning_parser_plugin: str | None = None - # Deprecated guided decoding fields - guided_decoding_backend: str | None = None - guided_decoding_disable_fallback: bool | None = None - guided_decoding_disable_any_whitespace: bool | None = None - guided_decoding_disable_additional_properties: bool | None = None logits_processor_pattern: str | None = ModelConfig.logits_processor_pattern @@ -725,19 +720,6 @@ class EngineArgs: "--reasoning-parser-plugin", **structured_outputs_kwargs["reasoning_parser_plugin"], ) - # Deprecated guided decoding arguments - for arg, type in [ - ("--guided-decoding-backend", str), - ("--guided-decoding-disable-fallback", bool), - ("--guided-decoding-disable-any-whitespace", bool), - ("--guided-decoding-disable-additional-properties", bool), - ]: - structured_outputs_group.add_argument( - arg, - type=type, - help=(f"[DEPRECATED] {arg} will be removed in v0.12.0."), - deprecated=True, - ) # Parallel arguments parallel_kwargs = get_kwargs(ParallelConfig) @@ -855,30 +837,6 @@ class EngineArgs: "--expert-placement-strategy", **parallel_kwargs["expert_placement_strategy"], ) - parallel_group.add_argument( - "--num-redundant-experts", - type=int, - help="[DEPRECATED] --num-redundant-experts will be removed in v0.12.0.", - deprecated=True, - ) - parallel_group.add_argument( - "--eplb-window-size", - type=int, - help="[DEPRECATED] 
--eplb-window-size will be removed in v0.12.0.", - deprecated=True, - ) - parallel_group.add_argument( - "--eplb-step-interval", - type=int, - help="[DEPRECATED] --eplb-step-interval will be removed in v0.12.0.", - deprecated=True, - ) - parallel_group.add_argument( - "--eplb-log-balancedness", - action=argparse.BooleanOptionalAction, - help="[DEPRECATED] --eplb-log-balancedness will be removed in v0.12.0.", - deprecated=True, - ) parallel_group.add_argument( "--max-parallel-loading-workers", @@ -1612,6 +1570,12 @@ class EngineArgs: model_config.skip_tokenizer_init = True logger.info("Skipping tokenizer initialization for tokens-only mode.") + if self.async_scheduling and not self.disable_nccl_for_dp_synchronization: + logger.info( + "Disabling NCCL for DP synchronization when using async scheduling." + ) + self.disable_nccl_for_dp_synchronization = True + # Forward the deprecated CLI args to the EPLB config. if self.num_redundant_experts is not None: self.eplb_config.num_redundant_experts = self.num_redundant_experts @@ -1736,21 +1700,6 @@ class EngineArgs: self.reasoning_parser_plugin ) - # Forward the deprecated CLI args to the StructuredOutputsConfig - so_config = self.structured_outputs_config - if self.guided_decoding_backend is not None: - so_config.guided_decoding_backend = self.guided_decoding_backend - if self.guided_decoding_disable_fallback is not None: - so_config.disable_fallback = self.guided_decoding_disable_fallback - if self.guided_decoding_disable_any_whitespace is not None: - so_config.disable_any_whitespace = ( - self.guided_decoding_disable_any_whitespace - ) - if self.guided_decoding_disable_additional_properties is not None: - so_config.disable_additional_properties = ( - self.guided_decoding_disable_additional_properties - ) - observability_config = ObservabilityConfig( show_hidden_metrics_for_version=self.show_hidden_metrics_for_version, otlp_traces_endpoint=self.otlp_traces_endpoint, diff --git a/vllm/entrypoints/openai/protocol.py 
b/vllm/entrypoints/openai/protocol.py index b352c3ad01db0..c4023a6185289 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -652,62 +652,6 @@ class ChatCompletionRequest(OpenAIBaseModel): default=None, description="Additional kwargs for structured outputs", ) - guided_json: str | dict | BaseModel | None = Field( - default=None, - description=( - "`guided_json` is deprecated. " - "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " - "Please pass `json` to `structured_outputs` instead." - ), - ) - guided_regex: str | None = Field( - default=None, - description=( - "`guided_regex` is deprecated. " - "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " - "Please pass `regex` to `structured_outputs` instead." - ), - ) - guided_choice: list[str] | None = Field( - default=None, - description=( - "`guided_choice` is deprecated. " - "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " - "Please pass `choice` to `structured_outputs` instead." - ), - ) - guided_grammar: str | None = Field( - default=None, - description=( - "`guided_grammar` is deprecated. " - "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " - "Please pass `grammar` to `structured_outputs` instead." - ), - ) - structural_tag: str | None = Field( - default=None, - description=( - "`structural_tag` is deprecated. " - "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " - "Please pass `structural_tag` to `structured_outputs` instead." - ), - ) - guided_decoding_backend: str | None = Field( - default=None, - description=( - "`guided_decoding_backend` is deprecated. " - "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " - "Please remove it from your request." - ), - ) - guided_whitespace_pattern: str | None = Field( - default=None, - description=( - "`guided_whitespace_pattern` is deprecated. " - "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. 
" - "Please pass `whitespace_pattern` to `structured_outputs` instead." - ), - ) priority: int = Field( default=0, description=( @@ -717,7 +661,7 @@ class ChatCompletionRequest(OpenAIBaseModel): ), ) request_id: str = Field( - default_factory=lambda: f"{random_uuid()}", + default_factory=random_uuid, description=( "The request_id related to this request. If the caller does " "not set it, a random_uuid will be generated. This id is used " @@ -841,20 +785,6 @@ class ChatCompletionRequest(OpenAIBaseModel): if prompt_logprobs is None and self.echo: prompt_logprobs = self.top_logprobs - # Forward deprecated guided_* parameters to structured_outputs - if self.structured_outputs is None: - kwargs = dict[str, Any]( - json=self.guided_json, - regex=self.guided_regex, - choice=self.guided_choice, - grammar=self.guided_grammar, - whitespace_pattern=self.guided_whitespace_pattern, - structural_tag=self.structural_tag, - ) - kwargs = {k: v for k, v in kwargs.items() if v is not None} - if len(kwargs) > 0: - self.structured_outputs = StructuredOutputsParams(**kwargs) - response_format = self.response_format if response_format is not None: # If structured outputs wasn't already enabled, @@ -863,24 +793,23 @@ class ChatCompletionRequest(OpenAIBaseModel): self.structured_outputs = StructuredOutputsParams() # Set structured output params for response format - if response_format is not None: - if response_format.type == "json_object": - self.structured_outputs.json_object = True - elif response_format.type == "json_schema": - json_schema = response_format.json_schema - assert json_schema is not None - self.structured_outputs.json = json_schema.json_schema - elif response_format.type == "structural_tag": - structural_tag = response_format - assert structural_tag is not None and isinstance( - structural_tag, - ( - LegacyStructuralTagResponseFormat, - StructuralTagResponseFormat, - ), - ) - s_tag_obj = structural_tag.model_dump(by_alias=True) - self.structured_outputs.structural_tag = 
json.dumps(s_tag_obj) + if response_format.type == "json_object": + self.structured_outputs.json_object = True + elif response_format.type == "json_schema": + json_schema = response_format.json_schema + assert json_schema is not None + self.structured_outputs.json = json_schema.json_schema + elif response_format.type == "structural_tag": + structural_tag = response_format + assert structural_tag is not None and isinstance( + structural_tag, + ( + LegacyStructuralTagResponseFormat, + StructuralTagResponseFormat, + ), + ) + s_tag_obj = structural_tag.model_dump(by_alias=True) + self.structured_outputs.structural_tag = json.dumps(s_tag_obj) extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params: @@ -1140,58 +1069,6 @@ class CompletionRequest(OpenAIBaseModel): default=None, description="Additional kwargs for structured outputs", ) - guided_json: str | dict | BaseModel | None = Field( - default=None, - description=( - "`guided_json` is deprecated. " - "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " - "Please pass `json` to `structured_outputs` instead." - ), - ) - guided_regex: str | None = Field( - default=None, - description=( - "`guided_regex` is deprecated. " - "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " - "Please pass `regex` to `structured_outputs` instead." - ), - ) - guided_choice: list[str] | None = Field( - default=None, - description=( - "`guided_choice` is deprecated. " - "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " - "Please pass `choice` to `structured_outputs` instead." - ), - ) - guided_grammar: str | None = Field( - default=None, - description=( - "`guided_grammar` is deprecated. " - "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " - "Please pass `grammar` to `structured_outputs` instead." 
- ), - ) - structural_tag: str | None = Field( - default=None, - description=("If specified, the output will follow the structural tag schema."), - ) - guided_decoding_backend: str | None = Field( - default=None, - description=( - "`guided_decoding_backend` is deprecated. " - "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " - "Please remove it from your request." - ), - ) - guided_whitespace_pattern: str | None = Field( - default=None, - description=( - "`guided_whitespace_pattern` is deprecated. " - "This will be removed in v0.12.0 or v1.0.0, whichever is soonest. " - "Please pass `whitespace_pattern` to `structured_outputs` instead." - ), - ) priority: int = Field( default=0, description=( @@ -1201,7 +1078,7 @@ class CompletionRequest(OpenAIBaseModel): ), ) request_id: str = Field( - default_factory=lambda: f"{random_uuid()}", + default_factory=random_uuid, description=( "The request_id related to this request. If the caller does " "not set it, a random_uuid will be generated. 
This id is used " @@ -1336,35 +1213,31 @@ class CompletionRequest(OpenAIBaseModel): echo_without_generation = self.echo and self.max_tokens == 0 - guided_json_object = None - if self.response_format is not None: - if self.response_format.type == "json_object": - guided_json_object = True - elif self.response_format.type == "json_schema": - json_schema = self.response_format.json_schema + response_format = self.response_format + if response_format is not None: + # If structured outputs wasn't already enabled, + # we must enable it for these features to work + if self.structured_outputs is None: + self.structured_outputs = StructuredOutputsParams() + + # Set structured output params for response format + if response_format.type == "json_object": + self.structured_outputs.json_object = True + elif response_format.type == "json_schema": + json_schema = response_format.json_schema assert json_schema is not None - self.guided_json = json_schema.json_schema - elif self.response_format.type == "structural_tag": - structural_tag = self.response_format + self.structured_outputs.json = json_schema.json_schema + elif response_format.type == "structural_tag": + structural_tag = response_format assert structural_tag is not None and isinstance( - structural_tag, StructuralTagResponseFormat + structural_tag, + ( + LegacyStructuralTagResponseFormat, + StructuralTagResponseFormat, + ), ) s_tag_obj = structural_tag.model_dump(by_alias=True) - self.structural_tag = json.dumps(s_tag_obj) - - # Forward deprecated guided_* parameters to structured_outputs - if self.structured_outputs is None: - kwargs = dict[str, Any]( - json=self.guided_json, - json_object=guided_json_object, - regex=self.guided_regex, - choice=self.guided_choice, - grammar=self.guided_grammar, - whitespace_pattern=self.guided_whitespace_pattern, - ) - kwargs = {k: v for k, v in kwargs.items() if v is not None} - if len(kwargs) > 0: - self.structured_outputs = StructuredOutputsParams(**kwargs) + 
self.structured_outputs.structural_tag = json.dumps(s_tag_obj) extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} if self.kv_transfer_params: @@ -1502,7 +1375,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): ), ) request_id: str = Field( - default_factory=lambda: f"{random_uuid()}", + default_factory=random_uuid, description=( "The request_id related to this request. If the caller does " "not set it, a random_uuid will be generated. This id is used " @@ -1597,7 +1470,7 @@ class EmbeddingChatRequest(OpenAIBaseModel): ), ) request_id: str = Field( - default_factory=lambda: f"{random_uuid()}", + default_factory=random_uuid, description=( "The request_id related to this request. If the caller does " "not set it, a random_uuid will be generated. This id is used " @@ -2019,7 +1892,7 @@ class ClassificationCompletionRequest(OpenAIBaseModel): ), ) request_id: str = Field( - default_factory=lambda: f"{random_uuid()}", + default_factory=random_uuid, description=( "The request_id related to this request. If the caller does " "not set it, a random_uuid will be generated. This id is used " @@ -2110,7 +1983,7 @@ class ClassificationChatRequest(OpenAIBaseModel): ) request_id: str = Field( - default_factory=lambda: f"{random_uuid()}", + default_factory=random_uuid, description=( "The request_id related to this request. If the caller does " "not set it, a random_uuid will be generated. This id is used " @@ -3221,7 +3094,7 @@ class TranslationResponseVerbose(OpenAIBaseModel): ####### Tokens IN <> Tokens OUT ####### class GenerateRequest(BaseModel): request_id: str = Field( - default_factory=lambda: f"{random_uuid()}", + default_factory=random_uuid, description=( "The request_id related to this request. If the caller does " "not set it, a random_uuid will be generated. 
This id is used " @@ -3278,7 +3151,7 @@ class GenerateResponseChoice(BaseModel): class GenerateResponse(BaseModel): request_id: str = Field( - default_factory=lambda: f"{random_uuid()}", + default_factory=random_uuid, description=( "The request_id related to this request. If the caller does " "not set it, a random_uuid will be generated. This id is used " diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 6cc685acd6728..2a870dbc3afac 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -273,6 +273,11 @@ class OpenAIServingChat(OpenAIServing): try: for i, engine_prompt in enumerate(engine_prompts): prompt_text, _, _ = self._get_prompt_components(request_prompts[i]) + # If we are creating sub requests for multiple prompts, ensure that they + # have unique request ids. + sub_request_id = ( + request_id if len(engine_prompts) == 1 else f"{request_id}_{i}" + ) if self.default_sampling_params is None: self.default_sampling_params = {} @@ -301,7 +306,7 @@ class OpenAIServingChat(OpenAIServing): ) self._log_inputs( - request_id, + sub_request_id, request_prompts[i], params=sampling_params, lora_request=lora_request, @@ -316,14 +321,14 @@ class OpenAIServingChat(OpenAIServing): if isinstance(sampling_params, BeamSearchParams): generator = self.beam_search( prompt=engine_prompt, - request_id=request_id, + request_id=sub_request_id, params=sampling_params, lora_request=lora_request, trace_headers=trace_headers, ) else: engine_request, tokenization_kwargs = await self._process_inputs( - request_id, + sub_request_id, engine_prompt, sampling_params, lora_request=lora_request, @@ -334,7 +339,7 @@ class OpenAIServingChat(OpenAIServing): generator = self.engine_client.generate( engine_request, sampling_params, - request_id, + sub_request_id, lora_request=lora_request, trace_headers=trace_headers, priority=request.priority, diff --git a/vllm/entrypoints/openai/serving_engine.py 
b/vllm/entrypoints/openai/serving_engine.py index 7dab5dbacd28c..09a135b701d05 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1242,16 +1242,19 @@ class OpenAIServing: ): prompt_text, _, _ = self._get_prompt_components(request_prompt) orig_priority = priority + sub_request = 0 while True: + # Ensure that each sub-request has a unique request id. + sub_request_id = f"{request_id}_{sub_request}" self._log_inputs( - request_id, + sub_request_id, request_prompt, params=sampling_params, lora_request=lora_request, ) trace_headers = kwargs.get("trace_headers") engine_request, tokenization_kwargs = await self._process_inputs( - request_id, + sub_request_id, engine_prompt, sampling_params, lora_request=lora_request, @@ -1262,7 +1265,7 @@ class OpenAIServing: generator = self.engine_client.generate( engine_request, sampling_params, - request_id, + sub_request_id, lora_request=lora_request, priority=priority, prompt_text=prompt_text, @@ -1295,6 +1298,7 @@ class OpenAIServing: sampling_params.max_tokens = self.max_model_len - len(prompt_token_ids) # OPTIMIZATION priority = orig_priority - 1 + sub_request += 1 def _get_prompt_components( self, @@ -1345,11 +1349,12 @@ class OpenAIServing: raw_request: Request | None, default: str | None = None ) -> str | None: """Pulls the request id to use from a header, if provided""" - default = default or random_uuid() - if raw_request is None: - return default + if raw_request is not None and ( + (req_id := raw_request.headers.get("X-Request-Id")) is not None + ): + return req_id - return raw_request.headers.get("X-Request-Id", default) + return random_uuid() if default is None else default @staticmethod def _get_data_parallel_rank(raw_request: Request | None) -> int | None: diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index b9b9b1ab30ad8..3dece07748cc4 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ 
b/vllm/entrypoints/openai/speech_to_text.py @@ -201,10 +201,10 @@ class OpenAISpeechToText(OpenAIServing): self.engine_client.generate( prompt, sampling_params, - request_id, + f"{request_id}_{i}", lora_request=lora_request, ) - for prompt in prompts + for i, prompt in enumerate(prompts) ] except ValueError as e: # TODO: Use a vllm-specific Validation Error diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 8b33727f05fbc..be7f673e5618f 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -812,19 +812,19 @@ def override_envs_for_invariance(): # "TRITON_MLA", ] if curr_attn_backend not in supported_backends: - warning = ( - "Forcibly updating attention backend to" - f" {supported_backends[0]} for batch_invariant. " - f" Supported backends: {supported_backends}." + error = ( + "VLLM batch_invariant mode requires an attention backend in " + f"{supported_backends}, but got '{curr_attn_backend}'. " + "Please set the 'VLLM_ATTENTION_BACKEND' environment variable " + "to one of the supported backends before enabling batch_invariant." ) - logger.warning_once(warning) - os.environ["VLLM_ATTENTION_BACKEND"] = supported_backends[0] + raise RuntimeError(error) if os.environ["VLLM_ATTENTION_BACKEND"] != supported_backends[0]: warning = ( "You are using a decode-invariant form of batch invariance. " "This will not be invariant between prefill and decode." 
) - logger.warning_once(warning) + logger.warning_once(warning, scope="local") os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0" os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" diff --git a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py index 572307052b489..659a2d4ee5b39 100644 --- a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py @@ -6,22 +6,7 @@ import torch from torch.nn import functional as F from vllm import _custom_ops as ops - - -def silu_and_mul(x: torch.Tensor) -> torch.Tensor: - d = x.shape[-1] // 2 - return F.silu(x[..., :d]) * x[..., d:] - - -def swigluoai_and_mul( - x: torch.Tensor, alpha: float = 1.702, limit: float = 7.0 -) -> torch.Tensor: - d = x.shape[-1] // 2 - gate, up = x[..., :d], x[..., d:] - gate = gate.clamp(max=limit) - up = up.clamp(min=-limit, max=limit) - glu = gate * torch.sigmoid(alpha * gate) - return (up + 1) * glu +from vllm.model_executor.layers.activation import SiluAndMul, SwigluOAIAndMul def grouped_topk( @@ -227,6 +212,11 @@ class CPUFusedMOE: layer.w13_weight = torch.nn.Parameter(torch.empty(0), requires_grad=False) layer.w2_weight = torch.nn.Parameter(torch.empty(0), requires_grad=False) + self.act_to_impl = { + "silu": SiluAndMul(), + "swigluoai": SwigluOAIAndMul(), + } + def __call__( self, layer: torch.nn.Module, @@ -246,7 +236,7 @@ class CPUFusedMOE: apply_router_weight_on_input: bool = False, activation: str = "silu", ) -> torch.Tensor: - assert activation in {"silu", "swigluoai"}, f"{activation} is not supported." + assert activation in self.act_to_impl, f"{activation} is not supported." 
assert not apply_router_weight_on_input topk_weights, topk_ids = select_experts( hidden_states=x, @@ -283,10 +273,7 @@ class CPUFusedMOE: tokens_for_this_expert = sorted_tokens[start_idx:end_idx] gate_up = layer.gate_up_linear[i](tokens_for_this_expert) - if activation == "swigluoai": - gate_up = swigluoai_and_mul(gate_up) - else: - gate_up = silu_and_mul(gate_up) + gate_up = self.act_to_impl[activation].forward_native(gate_up) expert_out = layer.down_linear[i](gate_up) outputs.append(expert_out) start_idx = end_idx diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py index 073e90a4e6808..ef7090c349fc6 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py @@ -90,10 +90,14 @@ class FusedMoEMethodBase(QuantizeMethodBase): def allow_inplace(self) -> bool: return False + @property + def method_name(self) -> str: + return self.__class__.__name__ + @abstractmethod def apply( self, - layer: torch.nn.Module, + layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, router_logits: torch.Tensor, top_k: int, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index c6dc95acdb636..c23c41df226f0 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -66,6 +66,10 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): def allow_inplace(self) -> bool: return self.old_quant_method.allow_inplace + @property + def method_name(self) -> str: + return self.old_quant_method.method_name + def create_weights( self, layer: torch.nn.Module, @@ -84,7 +88,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): def apply( self, - layer: torch.nn.Module, + layer: "FusedMoE", # type: 
ignore[name-defined] # noqa: F821 x: torch.Tensor, router_logits: torch.Tensor, top_k: int, @@ -105,42 +109,9 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - # Is getattr needed? - zero_expert_num = getattr(layer, "zero_expert_num", 0) - zero_expert_type = getattr(layer, "zero_expert_type", None) - - if enable_eplb: - if self.supports_eplb: - assert expert_load_view is not None - assert logical_to_physical_map is not None - assert logical_replica_count is not None - else: - raise NotImplementedError( - "EPLB is not supported for " - f"{self.old_quant_method.__class__.__name__}." - ) - topk_weights, topk_ids, zero_expert_result = layer.select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, - enable_eplb=enable_eplb, - expert_map=expert_map, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - global_num_experts=global_num_experts, - zero_expert_num=zero_expert_num, - zero_expert_type=zero_expert_type, ) result = self.fused_experts( @@ -156,7 +127,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): expert_map=None if self.disable_expert_map else expert_map, ) - if zero_expert_num != 0 and zero_expert_type is not None: + if layer.zero_expert_num != 0 and layer.zero_expert_type is not None: assert not isinstance(result, tuple), ( "Shared + zero experts are mutually exclusive not yet supported" ) diff --git 
a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 6619b64b2bbc0..bb30f1292a5fa 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1510,30 +1510,11 @@ class FusedMoE(CustomOp): logits_shape, dtype=moe.in_dtype, device=torch.cuda.current_device() ) - @staticmethod def select_experts( + self, hidden_states: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - use_grouped_topk: bool, - renormalize: bool, - topk_group: int | None = None, - num_expert_group: int | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - indices_type: torch.dtype | None = None, - enable_eplb: bool = False, - expert_map: torch.Tensor | None = None, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, - global_num_experts: int | None = None, - zero_expert_num: int | None = None, - zero_expert_type: str | None = None, - num_fused_shared_experts: int = 0, - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]: """ Route the input hidden states to the top-k experts based on the router logits. 
@@ -1552,6 +1533,27 @@ class FusedMoE(CustomOp): fused_topk_bias, ) + if self.enable_eplb: + if self.quant_method.supports_eplb: + if self.expert_load_view is None: + raise ValueError( + "enable_eplb=True requires expert_load_view != None" + ) + if self.logical_to_physical_map is None: + raise ValueError( + "enable_eplb=True requires logical_to_physical_map != None" + ) + if self.logical_replica_count is None: + raise ValueError( + "enable_eplb=True requires logical_replica_count != None" + ) + else: + raise NotImplementedError( + f"EPLB is not supported for {self.quant_method.method_name}." + ) + + indices_type = self.quant_method.topk_indices_dtype + # Check if we should use a routing simulation strategy routing_strategy = envs.VLLM_MOE_ROUTING_SIMULATION_STRATEGY if routing_strategy != "": @@ -1559,20 +1561,20 @@ hidden_states=hidden_states, router_logits=router_logits, strategy_name=routing_strategy, - top_k=top_k, + top_k=self.top_k, indices_type=indices_type, ) # DeepSeekv2 uses grouped_top_k - elif use_grouped_topk: - assert topk_group is not None - assert num_expert_group is not None + elif self.use_grouped_topk: + assert self.topk_group is not None + assert self.num_expert_group is not None if rocm_aiter_ops.is_fused_moe_enabled(): if not rocm_aiter_ops.is_fusion_moe_shared_experts_enabled(): - assert num_fused_shared_experts == 0 + assert self.num_fused_shared_experts == 0 grouped_topk_impl = partial( rocm_aiter_grouped_topk, - num_fused_shared_experts=num_fused_shared_experts, + num_fused_shared_experts=self.num_fused_shared_experts, ) else: grouped_topk_impl = grouped_topk @@ -1580,50 +1582,46 @@ topk_weights, topk_ids = grouped_topk_impl( hidden_states=hidden_states, gating_output=router_logits, - topk=top_k, - renormalize=renormalize, - num_expert_group=num_expert_group, - topk_group=topk_group, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - 
e_score_correction_bias=e_score_correction_bias, + topk=self.top_k, + renormalize=self.renormalize, + num_expert_group=self.num_expert_group, + topk_group=self.topk_group, + scoring_func=self.scoring_func, + routed_scaling_factor=self.routed_scaling_factor, + e_score_correction_bias=self.e_score_correction_bias, ) - elif e_score_correction_bias is not None: + elif self.e_score_correction_bias is not None: topk_weights, topk_ids = fused_topk_bias( hidden_states=hidden_states, gating_output=router_logits, - e_score_correction_bias=e_score_correction_bias.data, - topk=top_k, - renormalize=renormalize, + e_score_correction_bias=self.e_score_correction_bias.data, + topk=self.top_k, + renormalize=self.renormalize, ) - if routed_scaling_factor != 1.0: - topk_weights *= routed_scaling_factor - elif custom_routing_function is None: + if self.routed_scaling_factor != 1.0: + topk_weights *= self.routed_scaling_factor + elif self.custom_routing_function is None: topk_weights, topk_ids, token_expert_indices = fused_topk( hidden_states=hidden_states, gating_output=router_logits, - topk=top_k, - renormalize=renormalize, + topk=self.top_k, + renormalize=self.renormalize, indices_type=indices_type, ) else: - topk_weights, topk_ids = custom_routing_function( + topk_weights, topk_ids = self.custom_routing_function( hidden_states=hidden_states, gating_output=router_logits, - topk=top_k, - renormalize=renormalize, + topk=self.top_k, + renormalize=self.renormalize, ) - if enable_eplb: - assert expert_load_view is not None - assert logical_to_physical_map is not None - assert logical_replica_count is not None - + if self.enable_eplb: topk_ids = eplb_map_to_physical_and_record( topk_ids=topk_ids, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, + expert_load_view=self.expert_load_view, + logical_to_physical_map=self.logical_to_physical_map, + logical_replica_count=self.logical_replica_count, ) if 
(indices_type is not None) and topk_ids.dtype != indices_type: @@ -1633,16 +1631,16 @@ class FusedMoE(CustomOp): # Compute zero expert result if needed if ( - zero_expert_num is not None - and zero_expert_num > 0 - and zero_expert_type is not None - and global_num_experts is not None + self.zero_expert_num is not None + and self.zero_expert_num > 0 + and self.zero_expert_type is not None + and self.global_num_experts is not None ): zero_expert_result = zero_experts_compute_triton( expert_indices=topk_ids, expert_scales=topk_weights, - num_experts=global_num_experts, - zero_expert_type=zero_expert_type, + num_experts=self.global_num_experts, + zero_expert_type=self.zero_expert_type, hidden_states=hidden_states, ) else: @@ -1692,6 +1690,10 @@ class FusedMoE(CustomOp): ) def reduce_output(states: torch.Tensor) -> torch.Tensor: + # Slice before all_reduce to enable possible fusion + if self.hidden_size != og_hidden_states: + states = states[..., :og_hidden_states] + if ( not self.is_sequence_parallel and not self.use_dp_chunking @@ -1714,11 +1716,12 @@ class FusedMoE(CustomOp): if self.zero_expert_num is not None and self.zero_expert_num > 0: assert isinstance(fused_output, tuple) fused_output, zero_expert_result = fused_output - return (reduce_output(fused_output) + zero_expert_result)[ - ..., :og_hidden_states - ] + return ( + reduce_output(fused_output) + + zero_expert_result[..., :og_hidden_states] + ) else: - return reduce_output(fused_output)[..., :og_hidden_states] + return reduce_output(fused_output) else: if current_platform.is_tpu(): # TODO: Once the OOM issue for the TPU backend is resolved, we @@ -1731,8 +1734,8 @@ class FusedMoE(CustomOp): hidden_states, router_logits, self.layer_name ) return ( - reduce_output(shared_output)[..., :og_hidden_states], - reduce_output(fused_output)[..., :og_hidden_states], + reduce_output(shared_output), + reduce_output(fused_output), ) def forward_cuda( diff --git 
a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index 63b0e6f573d65..48e5a8907f926 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -331,7 +331,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): def forward_cuda( self, - layer: torch.nn.Module, + layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, use_grouped_topk: bool, top_k: int, @@ -352,31 +352,9 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - zero_expert_num = getattr(layer, "zero_expert_num", 0) - zero_expert_type = getattr(layer, "zero_expert_type", None) - topk_weights, topk_ids, zero_expert_result = layer.select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, - enable_eplb=enable_eplb, - expert_map=expert_map, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - global_num_experts=global_num_experts, - zero_expert_num=zero_expert_num, - zero_expert_type=zero_expert_type, - num_fused_shared_experts=layer.num_fused_shared_experts, ) if self.rocm_aiter_moe_enabled: @@ -415,7 +393,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): expert_map=expert_map, ) - if zero_expert_num != 0 and zero_expert_type is not None: + if 
layer.zero_expert_num != 0 and layer.zero_expert_type is not None: assert not isinstance(result, tuple), ( "Shared + zero experts are mutually exclusive not yet supported" ) @@ -425,7 +403,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): def forward_cpu( self, - layer: torch.nn.Module, + layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, use_grouped_topk: bool, top_k: int, @@ -474,7 +452,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): def forward_xpu( self, - layer: torch.nn.Module, + layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, use_grouped_topk: bool, top_k: int, @@ -515,7 +493,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): def forward_tpu( self, - layer: torch.nn.Module, + layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, use_grouped_topk: bool, top_k: int, diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 3f6ea68072b40..66945e2d2a7c8 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -597,7 +597,7 @@ class AWQMoEMethod(FusedMoEMethodBase): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, top_k: int, @@ -618,24 +618,11 @@ class AWQMoEMethod(FusedMoEMethodBase): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if enable_eplb: - raise NotImplementedError("EPLB not supported for `AWQMoEMethod` yet.") - assert activation == "silu", "Only SiLU activation is supported." 
- topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, ) return fused_marlin_moe( diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index e5a741e639ad9..1e57fa218b797 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -495,7 +495,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, top_k: int, @@ -518,25 +518,11 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts - if enable_eplb: - raise NotImplementedError( - "EPLB not supported for `BitsAndBytesMoEMethod` yet." - ) - - topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, ) + # TODO(bnell): Do these need to be called on the hot path? 
if self.quant_config.load_in_8bit: w13, w2 = self._apply_8bit_dequant(layer) else: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index ad547dd409822..149e4419c64a4 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -511,7 +511,7 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, top_k: int, @@ -532,16 +532,17 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if enable_eplb: - raise NotImplementedError( - "EPLB not supported for `CompressedTensorsW4A4MoeMethod` yet." - ) assert activation == "silu", "Only SiLU activation is supported." if ( self.allow_flashinfer and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM ): + if enable_eplb: + raise NotImplementedError( + "EPLB not supported for `CompressedTensorsW4A4MoeMethod` yet." 
+ ) + return flashinfer_trtllm_fp4_moe( layer=layer, x=x, @@ -554,19 +555,9 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): e_score_correction_bias=e_score_correction_bias, ) - topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, ) if self.use_marlin: @@ -1109,7 +1100,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, top_k: int, @@ -1130,31 +1121,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if enable_eplb: - assert expert_load_view is not None - assert logical_to_physical_map is not None - assert logical_replica_count is not None - assert isinstance(layer, FusedMoE) - - topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, - num_fused_shared_experts=layer.num_fused_shared_experts, - enable_eplb=enable_eplb, - expert_map=expert_map, - 
expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, ) per_act_token = self.input_quant.strategy == QuantizationStrategy.TOKEN @@ -1377,7 +1346,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, top_k: int, @@ -1398,26 +1367,11 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if enable_eplb: - raise NotImplementedError( - "EPLB not supported for `CompressedTensorsW8A8Int8MoEMethod` yet." - ) - from vllm.model_executor.layers.fused_moe import fused_experts - topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, ) return fused_experts( @@ -1738,7 +1692,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, top_k: int, @@ -1759,26 +1713,11 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if enable_eplb: - raise NotImplementedError( - "EPLB not supported for `CompressedTensorsWNA16MarlinMoEMethod` yet." 
- ) - assert activation == "silu", f"{activation} not supported for Marlin MoE." - topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, ) return fused_marlin_moe( @@ -2001,7 +1940,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, top_k: int, @@ -2022,43 +1961,11 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if enable_eplb: - if expert_load_view is None: - raise ValueError("enable_eplb=True requiere expert_load_view != None") - if logical_to_physical_map is None: - raise ValueError( - "enable_eplb=True requiere logical_to_physical_map != None" - ) - if logical_replica_count is None: - raise ValueError( - "enable_eplb=True requiere logical_replica_count != None" - ) - if not isinstance(layer, FusedMoE): - raise TypeError( - "EPLB is only supported when `layer` is a instance of FusedMoE." 
- ) - from vllm.model_executor.layers.fused_moe import fused_experts - topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, - num_fused_shared_experts=getattr(layer, "num_fused_shared_experts", 0), - enable_eplb=enable_eplb, - expert_map=expert_map, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, ) return fused_experts( diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 5241f9a2301be..7ebe40ec84687 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -137,7 +137,7 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, top_k: int, @@ -158,26 +158,11 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if enable_eplb: - raise NotImplementedError( - "EPLB not supported for `ExpertsInt8MoEMethod` yet." 
- ) - from vllm.model_executor.layers.fused_moe import fused_experts - topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, ) return fused_experts( diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 91bd45bf879cb..e033032903e87 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -28,6 +28,7 @@ from vllm.model_executor.layers.fused_moe import ( FusedMoeWeightScaleSupported, ) from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEParallelConfig, FusedMoEQuantConfig, RoutingMethodType, fp8_w8a8_moe_quant_config, @@ -118,7 +119,9 @@ class Fp8MoeBackend(Enum): TRITON = 6 -def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend: +def get_fp8_moe_backend( + block_quant: bool, moe_parallel_config: FusedMoEParallelConfig +) -> Fp8MoeBackend: """ Select the primary FP8 MoE backend Note: Shape-specific fallbacks may still occur at runtime. 
@@ -159,8 +162,19 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend: logger.info_once("Using Marlin backend for FP8 MoE") return Fp8MoeBackend.MARLIN - # deepGEMM on supported platforms with block-quantized weights - if envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM and block_quant: + # Determine if we should use DeepGEMM with block-quantized weights: + # - If explicitly set by user, respect their choice + # - If not explicitly set (default), disable when TP size is >= 8 + moe_use_deep_gemm = envs.VLLM_MOE_USE_DEEP_GEMM + if not envs.is_set("VLLM_MOE_USE_DEEP_GEMM") and moe_parallel_config.tp_size >= 8: + moe_use_deep_gemm = False + logger.info_once( + "DeepGEMM MoE is disabled by default when TP size is >= 8. " + "Set VLLM_MOE_USE_DEEP_GEMM=1 to enable it.", + scope="local", + ) + + if envs.VLLM_USE_DEEP_GEMM and moe_use_deep_gemm and block_quant: if not has_deep_gemm(): logger.warning_once( "DeepGEMM backend requested but not available.", scope="local" @@ -641,7 +655,9 @@ class Fp8MoEMethod(FusedMoEMethodBase): self.quant_config = quant_config self.weight_block_size = self.quant_config.weight_block_size self.block_quant: bool = self.weight_block_size is not None - self.fp8_backend = get_fp8_moe_backend(self.block_quant) + self.fp8_backend = get_fp8_moe_backend( + self.block_quant, layer.moe_parallel_config + ) self.use_marlin = self.fp8_backend == Fp8MoeBackend.MARLIN self.flashinfer_moe_backend: FlashinferMoeBackend | None = None @@ -1140,7 +1156,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, top_k: int, @@ -1216,31 +1232,9 @@ class Fp8MoEMethod(FusedMoEMethodBase): apply_router_weight_on_input=apply_router_weight_on_input, ) - zero_expert_num = getattr(layer, "zero_expert_num", 0) - zero_expert_type = getattr(layer, "zero_expert_type", None) - - select_result = FusedMoE.select_experts( + select_result = layer.select_experts( 
hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, - enable_eplb=enable_eplb, - expert_map=expert_map, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - global_num_experts=global_num_experts, - zero_expert_num=zero_expert_num, - zero_expert_type=zero_expert_type, - num_fused_shared_experts=layer.num_fused_shared_experts, ) topk_weights, topk_ids, zero_expert_result = select_result @@ -1322,7 +1316,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): self.allow_cutlass_block_scaled_grouped_gemm ), ) - if zero_expert_num != 0 and zero_expert_type is not None: + + if layer.zero_expert_num != 0 and layer.zero_expert_type is not None: assert not isinstance(result, tuple), ( "Shared + zero experts are mutually exclusive not yet supported" ) diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 42d7a67371ae8..bcdfafb50fc5a 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -621,7 +621,7 @@ class GGUFMoEMethod(FusedMoEMethodBase): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, top_k: int, @@ -642,9 +642,6 @@ class GGUFMoEMethod(FusedMoEMethodBase): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if enable_eplb: - raise NotImplementedError("EPLB not supported for `GGUFMoEMethod` yet.") - assert activation == "silu", "Only SiLU activation is 
supported." if apply_router_weight_on_input: raise NotImplementedError( @@ -652,19 +649,9 @@ class GGUFMoEMethod(FusedMoEMethodBase): "fused GGUF MoE method." ) - topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, ) return fused_moe_gguf( x, diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 68a122fd46c6b..77b15db373a3a 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -722,7 +722,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, top_k: int, @@ -743,26 +743,11 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if enable_eplb: - raise NotImplementedError( - "EPLB not supported for `GPTQMarlinMoEMethod` yet." - ) - assert activation == "silu", "Only SiLU activation is supported." 
- topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, ) return fused_marlin_moe( diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 01a23168bdde3..8165673135910 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -696,7 +696,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, top_k: int, @@ -717,12 +717,11 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if enable_eplb: - raise NotImplementedError( - "EPLB not supported for `ModelOptFp8MoEMethod` yet." - ) - if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: + if layer.enable_eplb: + raise NotImplementedError( + "EPLB not supported for `ModelOptFp8MoEMethod` yet." 
+ ) assert activation == "silu", ( f"Expected 'silu' activation but got {activation}" ) @@ -740,19 +739,9 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): ) # Expert selection - topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, ) if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: @@ -1459,7 +1448,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, top_k: int, @@ -1480,16 +1469,16 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if enable_eplb: - raise NotImplementedError( - "EPLB not supported for `ModelOptNvFp4FusedMoE` yet." - ) assert activation == "silu", "Only SiLU activation is supported." if ( self.allow_flashinfer and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM ): + if enable_eplb: + raise NotImplementedError( + "EPLB not supported for `ModelOptNvFp4FusedMoE` yet." 
+ ) return flashinfer_trtllm_fp4_moe( layer=layer, x=x, @@ -1502,19 +1491,9 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): e_score_correction_bias=e_score_correction_bias, ) - topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, ) if self.use_marlin: diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index 2090c86f78dc8..cf348290a2716 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -359,7 +359,7 @@ class MoeWNA16Method(FusedMoEMethodBase): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, top_k: int, @@ -380,25 +380,12 @@ class MoeWNA16Method(FusedMoEMethodBase): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if enable_eplb: - raise NotImplementedError("EPLB not supported for `MoeWNA16Method` yet.") - from vllm.model_executor.layers.fused_moe import fused_experts assert activation == "silu", "Only SiLU activation is supported." 
- topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, ) return fused_experts( diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 66ae2e94c60a5..198feb03be3e4 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -132,12 +132,15 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend: ) # If FlashInfer is not available, try either Marlin or Triton - if ( - envs.VLLM_MXFP4_USE_MARLIN - or current_platform.get_device_capability()[0] < 9 - or not has_triton_kernels() - or not is_torch_equal_or_newer("2.8.0") - ): + triton_kernels_supported = ( + has_triton_kernels() + and is_torch_equal_or_newer("2.8.0") + # NOTE: triton_kernels are only confirmed to work on SM90 and SM100 + # SM110 fails with this error: https://github.com/vllm-project/vllm/issues/29317 + # SM120 needs this fix: https://github.com/triton-lang/triton/pull/8498 + and (9, 0) <= current_platform.get_device_capability() < (11, 0) + ) + if envs.VLLM_MXFP4_USE_MARLIN or not triton_kernels_supported: logger.info_once("Using Marlin backend") return Mxfp4Backend.MARLIN else: @@ -862,7 +865,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, top_k: int, @@ -887,18 +890,9 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): raise NotImplementedError("EPLB is not supported for mxfp4") if self.mxfp4_backend == Mxfp4Backend.MARLIN: 
- topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, ) return fused_marlin_moe( @@ -989,17 +983,9 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): ): from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe - topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias, ) # Backend-specific preparation diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 30772c3665b06..8be0299eaa66f 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -334,7 +334,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, top_k: int, @@ -355,24 +355,9 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if enable_eplb: - raise NotImplementedError( - "EPLB not supported for `QuarkW8A8Fp8MoEMethod` yet." 
- ) - - topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, ) if self.rocm_aiter_moe_enabled: @@ -609,7 +594,7 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, top_k: int, @@ -630,24 +615,9 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if enable_eplb: - raise NotImplementedError( - "EPLB not supported for `QuarkOCP_MX_MoEMethod` yet." 
- ) - - topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, ) if not self.emulate: diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py index 52656263a601b..7b51b828009fc 100644 --- a/vllm/model_executor/layers/quantization/rtn.py +++ b/vllm/model_executor/layers/quantization/rtn.py @@ -356,7 +356,7 @@ class RTNMoEMethod(FusedMoEMethodBase): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, top_k: int, @@ -377,22 +377,9 @@ class RTNMoEMethod(FusedMoEMethodBase): logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if enable_eplb: - raise NotImplementedError("EPLB not supported for `RTNMoEMethod` yet.") - - topk_weights, topk_ids, _ = FusedMoE.select_experts( + topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, ) return fused_marlin_moe( diff --git a/vllm/model_executor/layers/rotary_embedding/__init__.py b/vllm/model_executor/layers/rotary_embedding/__init__.py index 152d9401b8e94..0f10bff6ac4f5 100644 
--- a/vllm/model_executor/layers/rotary_embedding/__init__.py +++ b/vllm/model_executor/layers/rotary_embedding/__init__.py @@ -17,6 +17,7 @@ from .llama4_vision_rope import Llama4VisionRotaryEmbedding from .mrope import MRotaryEmbedding from .ntk_scaling_rope import NTKScalingRotaryEmbedding from .phi3_long_rope_scaled_rope import Phi3LongRoPEScaledRotaryEmbedding +from .xdrope import XDRotaryEmbedding from .yarn_scaling_rope import YaRNScalingRotaryEmbedding _ROPE_DICT: dict[tuple, RotaryEmbedding] = {} @@ -184,6 +185,18 @@ def get_rope( raise ValueError( "Dynamic rope scaling must contain either 'alpha' or 'factor' field" ) + elif scaling_type == "xdrope": + scaling_alpha = rope_parameters["alpha"] + rotary_emb = XDRotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + scaling_alpha, + dtype, + xdrope_section=rope_parameters["xdrope_section"], + ) elif scaling_type == "yarn": scaling_factor = rope_parameters["factor"] original_max_position = rope_parameters["original_max_position_embeddings"] diff --git a/vllm/model_executor/layers/rotary_embedding/xdrope.py b/vllm/model_executor/layers/rotary_embedding/xdrope.py new file mode 100644 index 0000000000000..2432273faf195 --- /dev/null +++ b/vllm/model_executor/layers/rotary_embedding/xdrope.py @@ -0,0 +1,102 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import numpy as np +import torch + +from .common import apply_rotary_emb_dispatch +from .dynamic_ntk_alpha_rope import DynamicNTKAlphaRotaryEmbedding + + +class XDRotaryEmbedding(DynamicNTKAlphaRotaryEmbedding): + """DynamicNTKAlphaRotaryEmbedding extended with MultiModal(XD) Sections. + + Based on the original DynamicNTKAlphaRotaryEmbedding implementation. 
+    """
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: float,
+        is_neox_style: bool,
+        scaling_alpha: float,
+        dtype: torch.dtype,
+        xdrope_section: list[int],
+    ) -> None:
+        self.xdrope_section = xdrope_section
+        super().__init__(
+            head_size,
+            rotary_dim,
+            max_position_embeddings,
+            base,
+            is_neox_style,
+            scaling_alpha,
+            dtype,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor | None = None,
+        offsets: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        """PyTorch-native forward pass applying XD-RoPE section-wise.
+
+        Args:
+            positions:
+                [4, num_tokens] (P/W/H/T positions with multimodal inputs)
+            query: [num_tokens, num_heads * head_size]
+            key: [num_tokens, num_kv_heads * head_size]
+        """
+        assert positions.ndim == 2
+        assert key is not None
+
+        num_tokens = positions.shape[-1]
+        cos_sin = self.cos_sin_cache[positions]
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        cos = torch.cat(
+            [m[i] for i, m in enumerate(cos.split(self.xdrope_section, dim=-1))], dim=-1
+        )
+        sin = torch.cat(
+            [m[i] for i, m in enumerate(sin.split(self.xdrope_section, dim=-1))], dim=-1
+        )
+
+        query_shape = query.shape
+        query = query.view(num_tokens, -1, self.head_size)
+        query_rot = query[..., : self.rotary_dim]
+        query_pass = query[..., self.rotary_dim :]
+        query_rot = apply_rotary_emb_dispatch(query_rot, cos, sin, self.is_neox_style)
+        query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
+
+        key_shape = key.shape
+        key = key.view(num_tokens, -1, self.head_size)
+        key_rot = key[..., : self.rotary_dim]
+        key_pass = key[..., self.rotary_dim :]
+        key_rot = apply_rotary_emb_dispatch(key_rot, cos, sin, self.is_neox_style)
+        key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
+        return query, key
+
+    @staticmethod
+    def get_next_input_positions(
+        context_len: int,
+        seq_len: int,
+        xd_sections: int = 4,
+    ) -> list[list[int]]:
+        return 
[list(range(context_len, seq_len)) for _ in range(xd_sections)] + + @staticmethod + def get_next_input_positions_tensor( + out: np.ndarray, + out_offset: int, + context_len: int, + num_new_tokens: int, + ): + values = np.arange( + context_len, + context_len + num_new_tokens, + dtype=out.dtype, + ) + out[:, out_offset : out_offset + num_new_tokens] = values diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index 9fa5e2bd33f21..53fb444ed622d 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -576,7 +576,16 @@ class HunYuanDecoderLayer(nn.Module): return hidden_states, residual, ori_kv_states -@support_torch_compile +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + # positions is of shape (xd, seq_len) if xdrope is enabled for hunyuan-vl, + # otherwise (seq_len, ). + "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + } +) class HunYuanModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py new file mode 100644 index 0000000000000..e83addd0c092f --- /dev/null +++ b/vllm/model_executor/models/hunyuan_vision.py @@ -0,0 +1,1028 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# coding=utf-8 +# Copyright 2025 The HunYuan team. +# Copyright 2025 The vLLM team. +# Copyright 2025 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only HunYuan-VL model compatible with HuggingFace weights.""" + +from collections.abc import Callable, Iterable, Mapping, Sequence +from functools import partial +from typing import Annotated, Any, Literal, TypeAlias + +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import BatchFeature + +from vllm.attention.backends.registry import AttentionBackendEnum +from vllm.attention.layer import MultiHeadAttention +from vllm.config import MultiModalConfig, VllmConfig +from vllm.config.multimodal import BaseDummyOptions +from vllm.distributed import parallel_state +from vllm.distributed import utils as dist_utils +from vllm.logger import init_logger +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import ( + ImageItem, + ModalityData, + MultiModalDataDict, + MultiModalFeatureSpec, + MultiModalFieldConfig, + MultiModalKwargsItems, +) +from vllm.multimodal.parse import ( + DictEmbeddingItems, + 
ImageSize, + MultiModalDataItems, + MultiModalDataParser, +) +from vllm.multimodal.processing import ( + BaseMultiModalProcessor, + BaseProcessingInfo, + PromptReplacement, + PromptUpdate, +) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.hunyuan_vl import ( + HunYuanVLConfig, + HunYuanVLVisionConfig, +) +from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor +from vllm.transformers_utils.processors.hunyuan_vl_image import smart_resize +from vllm.utils.tensor_schema import TensorSchema, TensorShape + +from .interfaces import ( + MultiModalEmbeddings, + SupportsLoRA, + SupportsMultiModal, + SupportsPP, + SupportsQuant, + SupportsXDRoPE, +) +from .utils import ( + AutoWeightsLoader, + WeightsMapper, + init_vllm_registered_model, + maybe_prefix, +) + +logger = init_logger(__name__) + +# === Vision Inputs === # + + +class HunYuanVLImagePixelInputs(TensorSchema): + """ + Dimensions: + - np: Number of patches + - ni: Number of images + - cps: Number of channels * patch_size * patch_size + """ + + type: Literal["pixel_values"] + + pixel_values: Annotated[ + torch.Tensor, + TensorShape("np", "cps"), + ] + + image_grid_thw: Annotated[ + torch.Tensor, + TensorShape("ni", 3), + ] + + +class HunYuanVLImageEmbeddingInputs(TensorSchema): + """ + Dimensions: + - nf: Number of image features + - hs: Hidden size + - ni: Number of images + """ + + type: Literal["image_embeds"] + + image_embeds: Annotated[ + torch.Tensor, + TensorShape("nf", "hs"), + ] + + image_grid_thw: Annotated[ + torch.Tensor, + TensorShape("ni", 3), + ] + + +HunYuanVLImageInputs: TypeAlias = ( + HunYuanVLImagePixelInputs | HunYuanVLImageEmbeddingInputs +) + +# === Vision Encoder === # + + +class HunYuanVisionMLP(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: int, + bias: bool = True, + act_fn: Callable[[torch.Tensor], torch.Tensor] = F.gelu, + quant_config: 
QuantizationConfig | None = None, + prefix: str = "", + use_data_parallel: bool = False, + ): + super().__init__() + self.dense_h_to_4h = ColumnParallelLinear( + in_features, + hidden_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.dense_h_to_4h", + disable_tp=use_data_parallel, + ) + self.dense_4h_to_h = RowParallelLinear( + hidden_features, + in_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.dense_4h_to_h", + disable_tp=use_data_parallel, + ) + self.act_fn = act_fn + + def forward(self, x: torch.Tensor): + x_up, _ = self.dense_h_to_4h(x) + x_down, _ = self.dense_4h_to_h(self.act_fn(x_up)) + return x_down + + +class HunYuanVisionAttention(nn.Module): + def __init__( + self, + embed_dim: int, + num_heads: int, + projection_size: int, + quant_config: QuantizationConfig | None = None, + multimodal_config: MultiModalConfig | None = None, + prefix: str = "", + use_data_parallel: bool = False, + ) -> None: + super().__init__() + # Per attention head and per partition values. 
+ self.tp_size = ( + 1 + if use_data_parallel + else parallel_state.get_tensor_model_parallel_world_size() + ) + self.hidden_size_per_attention_head = dist_utils.divide( + projection_size, num_heads + ) + self.num_attention_heads_per_partition = dist_utils.divide( + num_heads, self.tp_size + ) + + self.qkv = QKVParallelLinear( + hidden_size=embed_dim, + head_size=self.hidden_size_per_attention_head, + total_num_heads=num_heads, + total_num_kv_heads=num_heads, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.qkv", + disable_tp=use_data_parallel, + ) + + self.o_proj = RowParallelLinear( + input_size=projection_size, + output_size=embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + disable_tp=use_data_parallel, + ) + + self.scale = self.hidden_size_per_attention_head**-0.5 + self.attn = MultiHeadAttention( + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + self.scale, + prefix=f"{prefix}.attn", + multimodal_config=multimodal_config, + ) + + def forward( + self, + x: torch.Tensor, + ) -> torch.Tensor: + qkv, _ = self.qkv(x) + q, k, v = qkv.chunk(3, dim=-1) + out = self.attn(q, k, v) + output, _ = self.o_proj(out) + return output + + +class HunYuanVisionBlock(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + mlp_hidden_dim: int, + act_fn: Callable[[torch.Tensor], torch.Tensor] = F.gelu, + norm_layer: Callable[[int], nn.Module] | None = None, + quant_config: QuantizationConfig | None = None, + multimodal_config: MultiModalConfig | None = None, + prefix: str = "", + use_data_parallel: bool = False, + ) -> None: + super().__init__() + if norm_layer is None: + norm_layer = partial(nn.LayerNorm, eps=1e-6) + self.input_layernorm = norm_layer(dim) + self.post_attention_layernorm = norm_layer(dim) + self.self_attn = HunYuanVisionAttention( + embed_dim=dim, + num_heads=num_heads, + projection_size=dim, + quant_config=quant_config, + multimodal_config=multimodal_config, + 
prefix=f"{prefix}.self_attn", + use_data_parallel=use_data_parallel, + ) + self.mlp = HunYuanVisionMLP( + dim, + mlp_hidden_dim, + act_fn=act_fn, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + use_data_parallel=use_data_parallel, + ) + + def forward( + self, + x: torch.Tensor, + ) -> torch.Tensor: + x = x + self.self_attn(self.input_layernorm(x)) + x = x + self.mlp(self.post_attention_layernorm(x)) + return x + + +class HunYuanVisionPatchEmbed(nn.Module): + def __init__(self, config: HunYuanVLVisionConfig): + super().__init__() + + self.config = config + self.embed_dim = config.hidden_size + self.patch_size = config.patch_size + self.num_channels = config.num_channels + self.spatial_merge_size = config.spatial_merge_size + self.interpolate_mode = config.interpolate_mode + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + bias=True, + ) + + self.max_num_patches = (config.max_image_size // self.patch_size) ** 2 + + self.num_positions = self.max_num_patches + 1 + self.position_edge = int(self.num_positions**0.5) + # first token is cls token, skip it + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + + self.patch_pos_embed = None + + def forward( + self, pixel_values: torch.Tensor, grid_thw: list[list[int]] + ) -> torch.Tensor: + num_patches = pixel_values.size(0) + pixel_values = pixel_values.reshape( + num_patches, self.num_channels, self.patch_size, self.patch_size + ) + + patch_embeds = self.patch_embedding(pixel_values) + patch_embeds = patch_embeds.squeeze(-1).squeeze(-1).unsqueeze(0) + + if self.patch_pos_embed is None: + patch_pos_shape = ( + 1, + self.position_edge, + self.position_edge, + self.embed_dim, + ) + self.patch_pos_embed = ( + self.position_embedding.weight[1:, :] + .reshape(patch_pos_shape) + .permute(0, 3, 1, 2) + .float() + ) + + patch_pos_embed_list = [] + for grid in grid_thw: + _, h0, 
w0 = grid + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + h0, w0 = h0 + 0.1, w0 + 0.1 + patch_pos_embed = nn.functional.interpolate( + self.patch_pos_embed, + scale_factor=(h0 / self.position_edge, w0 / self.position_edge), + mode=self.interpolate_mode, + align_corners=False, + ) + + patch_pos_embed = ( + patch_pos_embed.reshape(self.embed_dim, -1) + .transpose(0, 1) + .unsqueeze(0) + .to(patch_embeds.dtype) + ) + patch_pos_embed_list.append(patch_pos_embed) + + patch_pos_embed = torch.cat(patch_pos_embed_list, dim=1) + embeddings = patch_embeds + patch_pos_embed + + return embeddings + + +class HunYuanVisionPatchMerger(nn.Module): + def __init__( + self, + in_channels, + out_channels, + spatial_merge_size=2, + rms_norm_eps=1e-5, + prefix="", + ): + super().__init__() + self.spatial_merge_size = spatial_merge_size + embed_std = out_channels**-0.5 + + self.proj = nn.Sequential( + nn.Conv2d( + in_channels, + in_channels * 2, + kernel_size=spatial_merge_size, + stride=spatial_merge_size, + ), + nn.GELU(), + nn.Conv2d(in_channels * 2, in_channels * 4, kernel_size=1), + ) + self.mlp = nn.Linear(in_channels * 4, out_channels) + + self.image_newline = nn.Parameter(torch.randn(in_channels * 4) * embed_std) + self.image_begin = nn.Parameter(torch.randn(out_channels) * embed_std) + self.image_end = nn.Parameter(torch.randn(out_channels) * embed_std) + self.image_sep = nn.Parameter(torch.randn(out_channels) * embed_std) + + self.before_rms = RMSNorm(in_channels, eps=rms_norm_eps) + self.after_rms = RMSNorm(out_channels, eps=rms_norm_eps) + + def forward(self, x, size=(16, 16)): + x = self.before_rms(x) + + h, w = size + dtype = x.dtype + x = x.permute(0, 2, 1).reshape(x.shape[0], -1, h, w) + + x = self.proj(x) # b,c,h,w + b, c, h, w = x.shape + x = torch.cat( + [x, self.image_newline.reshape(1, c, 1, 1).expand(b, c, h, 1).to(dtype)], + dim=-1, + ) + x = x.reshape(b, c, 
-1).permute(0, 2, 1) + x = self.mlp(x) + + begin = self.image_begin.reshape(1, 1, -1).expand(b, 1, x.shape[-1]).to(dtype) + end = self.image_end.reshape(1, 1, -1).expand(b, 1, x.shape[-1]).to(dtype) + x = torch.cat([begin, x, end], dim=1) + + return self.after_rms(x) + + +class HunYuanVisionTransformer(nn.Module): + def __init__( + self, + vision_config: HunYuanVLVisionConfig, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + use_data_parallel: bool = False, + multimodal_config: MultiModalConfig | None = None, + attn_backend_override: AttentionBackendEnum | None = None, + ) -> None: + super().__init__() + + num_hidden_layers = vision_config.num_hidden_layers + self.hidden_size = vision_config.hidden_size + self.num_heads = vision_config.num_attention_heads + self.spatial_merge_size = vision_config.spatial_merge_size + + from vllm.compilation.backends import set_model_tag + + with set_model_tag("HunYuanVisionPatchEmbed"): + self.embeddings = HunYuanVisionPatchEmbed(vision_config) + + norm_layer = partial(nn.LayerNorm, eps=vision_config.rms_norm_eps) + + with set_model_tag("HunYuanVisionBlock"): + self.layers = nn.ModuleList( + [ + HunYuanVisionBlock( + dim=vision_config.hidden_size, + num_heads=vision_config.num_attention_heads, + mlp_hidden_dim=vision_config.intermediate_size, + act_fn=get_act_fn(vision_config.hidden_act), + norm_layer=norm_layer, + quant_config=quant_config, + multimodal_config=multimodal_config, + prefix=f"{prefix}.layers.{layer_idx}", + use_data_parallel=use_data_parallel, + ) + for layer_idx in range(num_hidden_layers) + ] + ) + + with set_model_tag("HunYuanVisionPatchMerger"): + self.perceive = HunYuanVisionPatchMerger( + vision_config.hidden_size, + vision_config.out_hidden_size, + spatial_merge_size=vision_config.spatial_merge_size, + rms_norm_eps=vision_config.rms_norm_eps, + prefix=f"{prefix}.perceive", + ) + + @property + def dtype(self) -> torch.dtype: + return self.embeddings.patch_embedding.weight.dtype + + 
@property + def device(self) -> torch.device: + return self.embeddings.patch_embedding.weight.device + + def forward( + self, + x: torch.Tensor, + grid_thw: list[list[int]], + ) -> torch.Tensor: + # patchify + seq_len = x.size(0) + cu_seqlens: list = [0] + + hidden_states = x.to(device=self.device, dtype=self.dtype) + hidden_states = self.embeddings(hidden_states, grid_thw) + + for t, h, w in grid_thw: + t, h, w = int(t), int(h), int(w) + cu_seqlens.append(h * w) + + cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32) + cu_seqlens = torch.cumsum(cu_seqlens, dim=0, dtype=torch.int32) + + cu_seqlens = cu_seqlens.to(device=self.device, non_blocking=True) + + hidden_states = hidden_states.reshape(seq_len, -1) + hidden_states = hidden_states.unsqueeze(0) + for layer_num, layer in enumerate(self.layers): + hidden_states = layer(hidden_states) + + # adapter + split_lengths = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + split_items = hidden_states.split(split_lengths, dim=1) + image_embeds_list = [] + for grid, split_item in zip(grid_thw, split_items): + image_embeds_list.append( + self.perceive(split_item.contiguous(), size=grid[1:]).squeeze(0) + ) + + return image_embeds_list + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv", ".q_proj", "q"), + (".qkv", ".k_proj", "k"), + (".qkv", ".v_proj", "v"), + ] + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + + for name, loaded_weight in weights: + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, 
loaded_weight) + loaded_params.add(name) + return loaded_params + + +def _hunyuan_vl_field_config(hf_inputs: Mapping[str, torch.Tensor]): + image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3))) + image_grid_sizes = image_grid_thw.prod(-1) + return dict( + pixel_values=MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes), + image_embeds=MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes), + image_grid_thw=MultiModalFieldConfig.batched("image"), + ) + + +class HunYuanVLMultiModalDataParser(MultiModalDataParser): + def _parse_image_data( + self, + data: dict[str, torch.Tensor] | ModalityData[ImageItem], + ): + if isinstance(data, dict): + return DictEmbeddingItems( + data, + modality="image", + required_fields={"image_embeds", "image_grid_thw"}, + fields_factory=_hunyuan_vl_field_config, + ) + + return super()._parse_image_data(data) + + +class HunYuanVLProcessingInfo(BaseProcessingInfo): + def get_hf_config(self): + return self.ctx.get_hf_config(HunYuanVLConfig) + + def get_hf_processor( + self, + **kwargs: object, + ) -> HunYuanVLProcessor: + return self.ctx.get_hf_processor( + HunYuanVLProcessor, + use_fast=kwargs.pop("use_fast", True), + **kwargs, + ) + + def get_image_processor( + self, + **kwargs: object, + ) -> HunYuanVLProcessor: + return self.get_hf_processor(**kwargs).image_processor + + def get_supported_mm_limits(self) -> Mapping[str, int | None]: + return {"image": None} + + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + max_image_tokens = self.get_max_image_tokens() + # TODO: support video + max_video_tokens = 0 + return {"image": max_image_tokens, "video": max_video_tokens} + + def _get_vision_info( + self, + *, + image_width: int, + image_height: int, + num_frames: int = 1, + do_resize: bool = True, + image_processor: HunYuanVLProcessor | None, + ) -> tuple[ImageSize, int]: + if image_processor is None: + image_processor = 
self.get_image_processor() + + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + patch_size = vision_config.patch_size + spatial_merge_size = vision_config.spatial_merge_size + + if do_resize: + resized_height, resized_width = smart_resize( + height=image_height, + width=image_width, + factor=patch_size * spatial_merge_size, + min_pixels=image_processor.min_pixels, + max_pixels=image_processor.max_pixels, + ) + preprocessed_size = ImageSize(width=resized_width, height=resized_height) + else: + preprocessed_size = ImageSize(width=image_width, height=image_height) + + grid_t = 1 + grid_h = preprocessed_size.height // patch_size + grid_w = preprocessed_size.width // patch_size + + num_vision_tokens = ( + grid_t * grid_h // spatial_merge_size * (grid_w // spatial_merge_size + 1) + + 2 + ) + + return preprocessed_size, num_vision_tokens + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + image_processor: HunYuanVLProcessor | None, + ) -> int: + _, num_image_tokens = self._get_vision_info( + image_width=image_width, + image_height=image_height, + image_processor=image_processor, + ) + return num_image_tokens + + def get_image_size_with_most_features(self) -> ImageSize: + max_image_size, _ = self._get_vision_info( + image_width=512, + image_height=8192, + image_processor=None, + ) + return max_image_size + + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + image_processor=None, + ) + + +class HunYuanVLDummyInputsBuilder(BaseDummyInputsBuilder[HunYuanVLProcessingInfo]): + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + + hf_processor = self.info.get_hf_processor() + image_token: str = hf_processor.image_token + + return image_token * num_images + + def get_dummy_mm_data( + self, + seq_len: int, + 
mm_counts: Mapping[str, int], + mm_options: Mapping[str, BaseDummyOptions] | None = None, + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 1) + + target_width, target_height = self.info.get_image_size_with_most_features() + + return { + "image": self._get_dummy_images( + width=target_width, height=target_height, num_images=num_images + ), + } + + +class HunYuanVLMultiModalProcessor(BaseMultiModalProcessor[HunYuanVLProcessingInfo]): + def _get_data_parser(self) -> MultiModalDataParser: + return HunYuanVLMultiModalDataParser() + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> BatchFeature: + return self.info.ctx.call_hf_processor( + self.info.get_hf_processor(**mm_kwargs), + dict(text=prompt, **mm_data), + dict(**mm_kwargs, **tok_kwargs), + ) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, Any], + out_mm_kwargs: MultiModalKwargsItems, + ) -> Sequence[PromptUpdate]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs) + + placeholder = { + "image": hf_processor.image_token_id, + } + + merge_size = image_processor.merge_size + + def get_replacement_hunyuan_vl(item_idx: int, modality: str): + out_item = out_mm_kwargs[modality][item_idx] + grid_thw = out_item[f"{modality}_grid_thw"].data + assert isinstance(grid_thw, torch.Tensor) + + _, grid_h, grid_w = grid_thw + num_tokens = (int(grid_h) // merge_size) * ( + int(grid_w) // merge_size + 1 + ) + 2 + return [placeholder[modality]] * num_tokens + + return [ + PromptReplacement( + modality=modality, + target=[placeholder[modality]], + replacement=partial(get_replacement_hunyuan_vl, modality=modality), + ) + for modality in ("image",) + ] + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, 
object], + ) -> Mapping[str, MultiModalFieldConfig]: + return _hunyuan_vl_field_config(hf_inputs) + + +@MULTIMODAL_REGISTRY.register_processor( + HunYuanVLMultiModalProcessor, + info=HunYuanVLProcessingInfo, + dummy_inputs=HunYuanVLDummyInputsBuilder, +) +class HunYuanVLForConditionalGeneration( + nn.Module, + SupportsMultiModal, + SupportsLoRA, + SupportsPP, + SupportsQuant, + SupportsXDRoPE, +): + multimodal_cpu_fields = {"image_grid_thw"} + + # To ensure correct weight loading and mapping. + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + # mapping for new names in checkpoint saved after transformers v4.52 + "vit.vit.": "visual.", + "vit.": "visual.", + "model.": "language_model.model.", + } + ) + + supports_encoder_tp_data = True + + def get_xdrope_input_positions( + self, + input_tokens: list[int], + mm_features: list[MultiModalFeatureSpec], + ) -> torch.Tensor: + kwargs = MultiModalFeatureSpec.gather_kwargs( + mm_features, + {"image_grid_thw"}, + ) + image_grid_thw = [item.tolist() for item in kwargs.get("image_grid_thw", [])] + + hf_config = self.config + image_start_token_id = hf_config.image_start_token_id + spatial_merge_size = hf_config.vision_config.spatial_merge_size + xd_num = len(hf_config.rope_scaling["xdrope_section"]) + + input_tokens_tensor = torch.tensor(input_tokens) + image_start_indices = torch.argwhere( + input_tokens_tensor == image_start_token_id + ).squeeze(1) + + p_index = torch.arange(len(input_tokens_tensor)) + w_index = torch.arange(len(input_tokens_tensor)) + h_index = torch.arange(len(input_tokens_tensor)) + t_index = torch.arange(len(input_tokens_tensor)) + for image_index in range(len(image_start_indices)): + # +1 : first image_token, +2: for xdrope positions + pos = image_start_indices[image_index] + 2 + t, h, w = image_grid_thw[image_index] + _, llm_grid_h, llm_grid_w = ( + t, + h // spatial_merge_size, + w // spatial_merge_size, + ) + + token_num = (llm_grid_w + 1) * llm_grid_h + w_index[pos : pos + 
token_num].copy_( + torch.arange(0, llm_grid_w + 1) + .reshape(1, -1) + .expand(llm_grid_h, -1) + .reshape(-1) + ) + h_index[pos : pos + token_num].copy_( + torch.arange(0, llm_grid_h) + .reshape(-1, 1) + .expand(-1, llm_grid_w + 1) + .reshape(-1) + ) + h_index[pos : pos + token_num] = 0 + + if xd_num == 4: + llm_positions = torch.stack([p_index, w_index, h_index, t_index]) + elif xd_num == 3: + llm_positions = torch.stack([w_index, h_index, t_index]) + + return llm_positions + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> str | None: + if modality.startswith("image"): + return "<|hy_place▁holder▁no▁100|><|hy_place▁holder▁no▁102|><|hy_place▁holder▁no▁101|>" # noqa: E501 + + raise ValueError("Only image modality is supported") + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config: HunYuanVLConfig = vllm_config.model_config.hf_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + + if multimodal_config.get_limit_per_prompt("image"): + attn_backend_override = ( + multimodal_config.mm_encoder_attn_backend + if multimodal_config is not None + else None + ) + self.visual = HunYuanVisionTransformer( + config.vision_config, + quant_config=self.quant_config, + prefix=maybe_prefix(prefix, "visual"), + multimodal_config=multimodal_config, + attn_backend_override=attn_backend_override, + ) + else: + self.visual = None + + self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model.model"), + architectures=[ + "HunYuanDenseV1ForCausalLM", + "HunYuanMoEV1ForCausalLM", + ], + ) + + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors + ) + + def _parse_and_validate_image_input( + self, **kwargs: object + ) -> HunYuanVLImageInputs | None: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = 
kwargs.pop("image_embeds", None) + image_grid_thw = kwargs.pop("image_grid_thw", None) + + if pixel_values is None and image_embeds is None: + return None + + # TODO: refine + if isinstance(pixel_values, list): + pixel_values = torch.cat(pixel_values, dim=0) + if len(pixel_values.shape) == 3: + last_dim = pixel_values.shape[-1] + pixel_values = pixel_values.reshape(-1, last_dim) + image_grid_thw = image_grid_thw.reshape(-1, 3) + + if pixel_values is not None: + return HunYuanVLImagePixelInputs( + type="pixel_values", + pixel_values=pixel_values, + image_grid_thw=image_grid_thw, + ) + + if image_embeds is not None: + return HunYuanVLImageEmbeddingInputs( + type="image_embeds", + image_embeds=image_embeds, + image_grid_thw=image_grid_thw, + ) + + def _process_image_input( + self, image_input: HunYuanVLImageInputs + ) -> tuple[torch.Tensor, ...]: + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() + + if image_input["type"] == "image_embeds": + image_embeds = image_input["image_embeds"].type(self.visual.dtype) + else: + pixel_values = image_input["pixel_values"] + + # TODO: use_data_parallel (split image_embeds in visual) + image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) + + return image_embeds + + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + mm_input_by_modality = {} + + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. 
+ for input_key in kwargs: + if ( + input_key in ("pixel_values", "image_embeds") + and "image" not in mm_input_by_modality + ): + mm_input_by_modality["image"] = self._parse_and_validate_image_input( + **kwargs + ) + return mm_input_by_modality + + def get_language_model(self) -> torch.nn.Module: + return self.language_model + + def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: + mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs) + if not mm_input_by_modality: + return [] + + # The result multimodal_embeddings is tuple of tensors, with each + # tensor correspoending to a multimodal data item (image or video). + multimodal_embeddings: tuple[torch.Tensor, ...] = () + + # NOTE: It is important to iterate over the keys in this dictionary + # to preserve the order of the modalities. + for modality in mm_input_by_modality: + multimodal_input = mm_input_by_modality[modality] + if modality == "image": + image_embeddings = self._process_image_input(multimodal_input) + multimodal_embeddings += tuple(image_embeddings) + return multimodal_embeddings + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None, + inputs_embeds: torch.Tensor | None, + **kwargs: object, + ) -> torch.Tensor | IntermediateTensors: + if intermediate_tensors is not None: + inputs_embeds = None + + hidden_states = self.language_model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + return self.language_model.compute_logits(hidden_states) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights, 
mapper=self.hf_to_vllm_mapper) + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model.model", + connector="visual.perceive", + tower_model="visual", + ) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 9966498e1b4c9..6f6ce32538b71 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -1047,7 +1047,7 @@ class SupportsMRoPE(Protocol): supports_mrope: ClassVar[Literal[True]] = True """ A flag that indicates this model supports M-RoPE. - + Note: There is no need to redefine this flag if this class is in the MRO of your model class. @@ -1088,3 +1088,52 @@ def supports_mrope( model: type[object] | object, ) -> TypeIs[type[SupportsMRoPE]] | TypeIs[SupportsMRoPE]: return isinstance(model, SupportsMRoPE) + + +@runtime_checkable +class SupportsXDRoPE(Protocol): + """The interface required for all models that support XD-RoPE.""" + + supports_xdrope: ClassVar[Literal[True]] = True + """ + A flag that indicates this model supports XD-RoPE. + + Note: + There is no need to redefine this flag if this class is in the + XDRope of your model class. + """ + + def get_xdrope_input_positions( + self, + input_tokens: list[int], + mm_features: list["MultiModalFeatureSpec"], + ) -> torch.Tensor: + """ + Get XD-RoPE input positions and delta value for this specific model. + + This method should be implemented by each model that supports XD-RoPE + to provide model-specific logic for computing input positions. + + Args: + input_tokens: List of input token IDs + mm_features: Information about each multi-modal data item + + Returns: + llm_positions: Tensor of shape `[xdrope_dim, num_tokens]` with + 4D(P/W/H/T) or 3D(W/H/T) positions. + """ + ... + + +@overload +def supports_xdrope(model: type[object]) -> TypeIs[type[SupportsXDRoPE]]: ... 
+ + +@overload +def supports_xdrope(model: object) -> TypeIs[SupportsXDRoPE]: ... + + +def supports_xdrope( + model: type[object] | object, +) -> TypeIs[type[SupportsXDRoPE]] | TypeIs[SupportsXDRoPE]: + return isinstance(model, SupportsXDRoPE) diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 3eaf2d80082f1..7a57644db1b13 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -142,6 +142,12 @@ class LlamaModel(nn.Module): # Get drafter's quantization config self.quant_config = get_draft_quant_config(vllm_config) + eagle_config = getattr(self.config, "eagle_config", None) + if eagle_config is not None and "use_aux_hidden_state" in eagle_config: + self.use_aux_hidden_state = eagle_config["use_aux_hidden_state"] + else: + self.use_aux_hidden_state = True + current_vllm_config = get_current_vllm_config() self.embed_tokens = VocabParallelEmbedding( @@ -161,20 +167,20 @@ class LlamaModel(nn.Module): for layer_idx in range(self.config.num_hidden_layers) ] ) - if hasattr(self.config, "target_hidden_size"): - fc_input_size = self.config.target_hidden_size * 3 - else: - fc_input_size = self.config.hidden_size * 3 - self.fc = ReplicatedLinear( - input_size=fc_input_size, - output_size=self.config.hidden_size, - bias=False, - params_dtype=vllm_config.model_config.dtype, - quant_config=self.quant_config, - prefix=maybe_prefix(prefix, "fc"), - return_bias=False, - ) - + if self.use_aux_hidden_state: + if hasattr(self.config, "target_hidden_size"): + fc_input_size = self.config.target_hidden_size * 3 + else: + fc_input_size = self.config.hidden_size * 3 + self.fc = ReplicatedLinear( + input_size=fc_input_size, + output_size=self.config.hidden_size, + bias=False, + params_dtype=vllm_config.model_config.dtype, + quant_config=self.quant_config, + prefix=maybe_prefix(prefix, "fc"), + return_bias=False, + ) self.norm = RMSNorm( self.config.hidden_size, 
eps=self.config.rms_norm_eps, @@ -332,6 +338,8 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM): self, hidden_states: torch.Tensor, ) -> torch.Tensor: + if not self.model.use_aux_hidden_state: + return hidden_states # combine multiple auxiliary hidden states returned by eagle3 return self.model.fc(hidden_states) @@ -357,6 +365,8 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM): skip_substrs.append("draft_id_to_target_id") if not includes_embed_tokens: skip_substrs.append("embed_tokens") + if not self.model.use_aux_hidden_state: + skip_substrs.append("fc.") loader = AutoWeightsLoader( self, skip_prefixes=None, diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py index 2e3e6dc166ad8..63ea6b259a71d 100644 --- a/vllm/model_executor/models/moonvit.py +++ b/vllm/model_executor/models/moonvit.py @@ -56,10 +56,13 @@ from transformers.utils import is_flash_attn_2_available from vllm.model_executor.layers.conv import Conv2dLayer from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.models.utils import maybe_prefix +from vllm.platforms import current_platform from vllm.transformers_utils.configs.moonvit import MoonViTConfig if is_flash_attn_2_available(): from flash_attn import flash_attn_varlen_func +elif current_platform.is_xpu(): + from vllm.attention.utils.fa_utils import flash_attn_varlen_func else: flash_attn_varlen_func = None @@ -106,10 +109,10 @@ def multihead_attention( q, k, v, - q_cu_seqlens, - k_cu_seqlens, - max_seqlen_q, - max_seqlen_k, + cu_seqlens_q=q_cu_seqlens, + cu_seqlens_k=k_cu_seqlens, + max_seqlen_q=max_seqlen_q, + max_seqlen_k=max_seqlen_k, causal=False, ) attn_out = attn_out.flatten(start_dim=-2) @@ -291,7 +294,12 @@ class Rope2DPosEmb(nn.Module): """ def __init__( - self, dim: int, max_height: int, max_width: int, theta_base=10000, device="cuda" + self, + dim: int, + max_height: int, + max_width: int, + theta_base=10000, + device=current_platform.device_type, ): 
super().__init__() self.dim = dim @@ -437,7 +445,7 @@ class MoonVitEncoderLayer(nn.Module): self.hidden_size_per_attention_head = self.hidden_dim // self.num_heads self.attn_implementation = attn_implementation # use fa2 in vllm by default - if is_flash_attn_2_available(): + if is_flash_attn_2_available() or current_platform.is_xpu(): self.attn_implementation = "flash_attention_2" self.norm0 = nn.LayerNorm(hidden_dim) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index 262ea771d9cdf..7506ee8656fda 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -23,7 +23,6 @@ """Inference-only Qwen2.5-Omni model (thinker part).""" from collections.abc import Callable, Iterable, Mapping, Sequence -from copy import copy from functools import partial from typing import Annotated, Any, Literal @@ -387,15 +386,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor( self._validate_mm_kwargs(mm_kwargs, mm_item_counts) self._validate_mm_updates(mm_prompt_updates, mm_item_counts) - use_audio_in_video = False - if "video" in mm_kwargs: - video_items = [item for item in mm_kwargs["video"] if item is not None] - # only check video items (if there are any) - if video_items: - use_audio_in_video = all( - item["use_audio_in_video"].data for item in video_items - ) - if is_update_applied: mm_placeholders = self._find_mm_placeholders( prompt_ids, @@ -404,7 +394,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor( self._validate_mm_placeholders( mm_placeholders, mm_item_counts, - use_audio_in_video=use_audio_in_video, ) else: prompt_ids, mm_placeholders = self._apply_prompt_updates( @@ -414,7 +403,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor( self._validate_mm_placeholders( mm_placeholders, mm_item_counts, - use_audio_in_video=use_audio_in_video, ) return prompt_ids, mm_placeholders @@ -640,19 +628,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor( return 
mm_processed_data - def _validate_mm_placeholders( - self, - mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]], - mm_item_counts: Mapping[str, int], - use_audio_in_video: bool = False, - ) -> None: - if use_audio_in_video: - mm_item_counts = copy(mm_item_counts) - if "video" in mm_item_counts: - assert "audio" in mm_item_counts - mm_item_counts["audio"] -= mm_item_counts["video"] - super()._validate_mm_placeholders(mm_placeholders, mm_item_counts) - class Qwen2_5OmniConditionalGenerationMixin: def _parse_and_validate_audio_input( diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 61f218f16d79c..f5f88f66eff91 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -68,11 +68,11 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataItems from vllm.multimodal.processing import ( - BaseMultiModalProcessor, MultiModalPromptUpdates, PlaceholderFeaturesInfo, PromptReplacement, PromptUpdate, + PromptUpdateDetails, ) from vllm.sequence import IntermediateTensors @@ -87,7 +87,6 @@ from .qwen2_5_omni_thinker import ( Qwen2_5OmniConditionalGenerationMixin, Qwen2_5OmniThinkerDummyInputsBuilder, Qwen2_5OmniThinkerMultiModalProcessor, - Qwen2_5OmniThinkerProcessingInfo, ) from .qwen2_5_vl import ( Qwen2_5_VisionAttention, @@ -807,24 +806,8 @@ class Qwen3OmniMoeThinkerMultiModalProcessor( else: use_audio_in_video = False - if use_audio_in_video and "video" in mm_item_counts: - assert "audio" in mm_item_counts - mm_item_counts["audio"] -= mm_item_counts["video"] - - # Special case with `use_audio_in_video=True` - if use_audio_in_video: - if is_update_applied: - prompt_ids = self._get_raw_input_ids(prompt_ids, use_audio_in_video) - ( - prompt_ids, - mm_placeholders, - ) = 
self._apply_prompt_updates( - prompt_ids, - mm_prompt_updates, - ) - self._validate_mm_placeholders(mm_placeholders, mm_item_counts) # normal case with `use_audio_in_video=False` - elif is_update_applied: + if is_update_applied: mm_placeholders = self._find_mm_placeholders( prompt_ids, mm_prompt_updates, @@ -834,10 +817,24 @@ class Qwen3OmniMoeThinkerMultiModalProcessor( mm_item_counts, ) else: - prompt_ids, mm_placeholders = self._apply_prompt_updates( - prompt_ids, - mm_prompt_updates, - ) + if use_audio_in_video and "audio" in mm_prompt_updates: + filtered_updates = { + k: v for k, v in mm_prompt_updates.items() if k != "audio" + } + prompt_ids, mm_placeholders = self._apply_prompt_updates( + prompt_ids, + filtered_updates, + ) + # Derive audio placeholders from video placeholders + mm_placeholders = self._derive_audio_from_video_placeholders( + mm_placeholders, mm_prompt_updates + ) + else: + prompt_ids, mm_placeholders = self._apply_prompt_updates( + prompt_ids, + mm_prompt_updates, + ) + self._validate_mm_placeholders( mm_placeholders, mm_item_counts, @@ -962,7 +959,9 @@ class Qwen3OmniMoeThinkerMultiModalProcessor( def get_replacement_qwen2_use_audio_in_video(item_idx: int): nonlocal audio_in_video_item_idx - audio_num_features = audio_output_lengths[audio_item_idx + item_idx] + audio_num_features = audio_output_lengths[ + audio_in_video_item_idx + item_idx + ] video_grid_thw = out_mm_data["video_grid_thw"][item_idx] audio_in_video_item_idx += 1 @@ -971,14 +970,17 @@ class Qwen3OmniMoeThinkerMultiModalProcessor( if second_per_grid_ts: video_second_per_grid_t = second_per_grid_ts[item_idx] else: - video_second_per_grid_t = 1.0 + video_second_per_grid_t = 2.0 - return self.get_updates_use_audio_in_video( + placeholder = self.get_updates_use_audio_in_video( thinker_config=thinker_config, audio_len=audio_num_features, video_grid_thw=video_grid_thw, video_second_per_grid_t=video_second_per_grid_t, ) + return PromptUpdateDetails.select_token_id( + placeholder, 
embed_token_id=video_token_id + ) video_replacement_fn = ( get_replacement_qwen2_use_audio_in_video @@ -1004,14 +1006,50 @@ class Qwen3OmniMoeThinkerMultiModalProcessor( ), ] - def _validate_mm_placeholders( + def _derive_audio_from_video_placeholders( self, - mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]], - mm_item_counts: Mapping[str, int], - ) -> None: - BaseMultiModalProcessor[ - Qwen2_5OmniThinkerProcessingInfo - ]._validate_mm_placeholders(self, mm_placeholders, mm_item_counts) + placeholders: Mapping[str, list[PlaceholderFeaturesInfo]], + mm_prompt_updates: MultiModalPromptUpdates, + ) -> Mapping[str, list[PlaceholderFeaturesInfo]]: + """ + Helper to derive audio placeholders from video placeholders when + use_audio_in_video=True. + """ + if "video" not in placeholders: + return placeholders + + # Validate audio and video counts match + num_videos = len(placeholders["video"]) + num_audios = len(mm_prompt_updates.get("audio", [])) + if num_audios != num_videos: + raise ValueError( + f"use_audio_in_video requires equal number of audio and video items, " + f"got {num_audios=}, {num_videos=}" + ) + + tokenizer = self.info.get_tokenizer() + processor = self.info.get_hf_processor() + audio_token_id = tokenizer.get_vocab()[processor.audio_token] + + result_placeholders = dict(placeholders) + audio_placeholders = [] + + # Each video is paired with one audio + for video_idx, video_placeholder in enumerate(placeholders["video"]): + # Create is_embed mask selecting only audio tokens + audio_is_embed = torch.tensor(video_placeholder.tokens) == audio_token_id + + audio_placeholder = PlaceholderFeaturesInfo( + modality="audio", + item_idx=video_idx, + start_idx=video_placeholder.start_idx, + tokens=video_placeholder.tokens, + is_embed=audio_is_embed, + ) + audio_placeholders.append(audio_placeholder) + + result_placeholders["audio"] = audio_placeholders + return result_placeholders def _get_raw_input_ids( self, @@ -1454,7 +1492,11 @@ class 
Qwen3OmniMoeThinkerForConditionalGeneration( ) if not len(second_per_grid_ts) and len(video_grid_thw): - second_per_grids = torch.ones(len(video_grid_thw), dtype=torch.float32) + second_per_grid_ts = 2.0 + second_per_grids = ( + torch.ones(len(video_grid_thw), dtype=torch.float32) + * second_per_grid_ts + ) else: second_per_grids = torch.tensor(second_per_grid_ts, dtype=torch.float32) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index b3da64af750c7..a0d8a78a2ae76 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -287,6 +287,10 @@ _MULTIMODAL_MODELS = { "GraniteSpeechForConditionalGeneration", ), "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), + "HunYuanVLForConditionalGeneration": ( + "hunyuan_vision", + "HunYuanVLForConditionalGeneration", + ), "InternVLChatModel": ("internvl", "InternVLChatModel"), "NemotronH_Nano_VL_V2": ("nano_nemotron_vl", "NemotronH_Nano_VL_V2"), "OpenCUAForConditionalGeneration": ( diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 0471c20429b1d..1e6b53021f888 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -134,6 +134,11 @@ class Platform: _global_graph_pool: Any | None = None + @property + def pass_key(self) -> str: + """Inductor config key for the PassManager custom pass""" + return "post_grad_custom_post_pass" + @property def supported_dtypes(self) -> list[torch.dtype]: """Returns the supported dtypes for the current platform.""" @@ -177,6 +182,21 @@ class Platform: # all ROCm platforms for now. return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM) + @classmethod + def get_pass_manager_cls(cls) -> str: + """ + Get the pass manager class for this platform. + It will be registered as a custom pass under the current_platform.pass_key. 
+ """ + return "vllm.compilation.pass_manager.PostGradPassManager" + + @classmethod + def get_compile_backend(cls) -> str: + """ + Get the custom compile backend for current platform. + """ + return cls.simple_compile_backend + @classmethod def device_id_to_physical_device_id(cls, device_id: int): # Treat empty device control env var as unset. This is a valid diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index f3ec965bd0881..0483f6c06ada8 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -262,6 +262,10 @@ class RocmPlatform(Platform): f"is not MLA type while requested for MLA backend." ) + if selected_backend == AttentionBackendEnum.FLEX_ATTENTION: + logger.info("Using FlexAttention backend.") + return AttentionBackendEnum.FLEX_ATTENTION.get_path() + if selected_backend == AttentionBackendEnum.TRITON_ATTN: logger.info("Using Triton Attention backend on V1 engine.") return AttentionBackendEnum.TRITON_ATTN.get_path() @@ -317,8 +321,8 @@ class RocmPlatform(Platform): return AttentionBackendEnum.TRITON_ATTN.get_path() raise RuntimeError( - "V0 attention backends have been removed. Set VLLM_USE_V1=1 " - "to select a supported backend." + f"Attention backend {selected_backend.name} is not supported on " + "ROCm. Note that V0 attention backends have been removed." 
) @classmethod diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 5c3dfa8ac9cbc..d1aab98c274e1 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -57,7 +57,7 @@ class PoolingParams( ## Internal use only task: PoolingTask | None = None requires_token_ids: bool = False - skip_reading_prefix_cache: bool = None + skip_reading_prefix_cache: bool | None = None extra_kwargs: dict[str, Any] | None = None output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index fbbe3d4cabb9a..8de961e62db1b 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -3,7 +3,6 @@ """Sampling parameters for text generation.""" import copy -import warnings from dataclasses import field from enum import Enum, IntEnum from functools import cached_property @@ -100,19 +99,6 @@ class StructuredOutputsParams: ) -@dataclass -class GuidedDecodingParams(StructuredOutputsParams): - def __post_init__(self): - warnings.warn( - "GuidedDecodingParams is deprecated. This will be removed in " - "v0.12.0 or v1.0.0, which ever is soonest. 
Please use " - "StructuredOutputsParams instead.", - DeprecationWarning, - stacklevel=2, - ) - return super().__post_init__() - - class RequestOutputKind(Enum): # Return entire output so far in every RequestOutput CUMULATIVE = 0 @@ -234,8 +220,6 @@ class SamplingParams( # Fields used to construct logits processors structured_outputs: StructuredOutputsParams | None = None """Parameters for configuring structured outputs.""" - guided_decoding: GuidedDecodingParams | None = None - """Deprecated alias for structured_outputs.""" logit_bias: dict[int, float] | None = None """If provided, the engine will construct a logits processor that applies these logit biases.""" @@ -254,7 +238,7 @@ class SamplingParams( generated token can complete the sequence.""" _bad_words_token_ids: list[list[int]] | None = None - skip_reading_prefix_cache: bool = None + skip_reading_prefix_cache: bool | None = None @staticmethod def from_optional( @@ -283,7 +267,6 @@ class SamplingParams( truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None, output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE, structured_outputs: StructuredOutputsParams | None = None, - guided_decoding: GuidedDecodingParams | None = None, logit_bias: dict[int, float] | dict[str, float] | None = None, allowed_token_ids: list[int] | None = None, extra_args: dict[str, Any] | None = None, @@ -295,16 +278,6 @@ class SamplingParams( int(token): min(100.0, max(-100.0, bias)) for token, bias in logit_bias.items() } - if guided_decoding is not None: - warnings.warn( - "guided_decoding is deprecated. This will be removed in " - "v0.12.0 or v1.0.0, which ever is soonest. 
Please use " - "structured_outputs instead.", - DeprecationWarning, - stacklevel=2, - ) - structured_outputs = guided_decoding - guided_decoding = None return SamplingParams( n=1 if n is None else n, @@ -387,17 +360,6 @@ class SamplingParams( # eos_token_id is added to this by the engine self._all_stop_token_ids.update(self.stop_token_ids) - if self.guided_decoding is not None: - warnings.warn( - "guided_decoding is deprecated. This will be removed in " - "v0.12.0 or v1.0.0, which ever is soonest. Please use " - "structured_outputs instead.", - DeprecationWarning, - stacklevel=2, - ) - self.structured_outputs = self.guided_decoding - self.guided_decoding = None - if self.skip_reading_prefix_cache is None: # If prefix caching is enabled, # the output of prompt logprobs may less than n_prompt_tokens, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 3d282da8c6112..c1880a3fba0ee 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -86,6 +86,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict( deepseek_vl_v2="DeepseekVLV2Config", deepseek_v32="DeepseekV3Config", flex_olmo="FlexOlmoConfig", + hunyuan_vl="HunYuanVLConfig", kimi_linear="KimiLinearConfig", kimi_vl="KimiVLConfig", RefinedWeb="RWConfig", # For tiiuae/falcon-40b(-instruct) @@ -549,6 +550,23 @@ def thinker_uses_mrope(config: PretrainedConfig) -> bool: return uses_mrope(thinker_text_config) +def uses_xdrope_dim(config: PretrainedConfig) -> int: + """Detect if the model with this config uses XD-ROPE.""" + xdrope_section = getattr(config, "xdrope_section", None) + if xdrope_section is not None and isinstance(xdrope_section, list): + return len(xdrope_section) + rope_scaling = getattr(config, "rope_scaling", None) + if rope_scaling is None: + return 0 + + if isinstance(rope_scaling, dict) and "xdrope_section" in rope_scaling: + xdrope_section = rope_scaling["xdrope_section"] + if xdrope_section is not None and 
class HunYuanVLVisionConfig(PretrainedConfig):
    """Vision-tower configuration for HunYuan-VL.

    Holds the ViT hyper-parameters (patching, depth, widths) together with
    the image/video sizing limits consumed by the preprocessor.
    """

    model_type = "hunyuan_vl"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_act="gelu",
        hidden_size=1152,
        intermediate_size=4304,
        interpolate_mode="bilinear",
        rms_norm_eps=1e-05,
        learnable_mlp_pooling_size=0,
        num_attention_heads=16,
        num_key_value_heads=None,
        num_channels=3,
        num_hidden_layers=27,
        out_hidden_size=4096,
        patch_size=16,
        remove_prenorm=True,
        spatial_merge_size=2,
        temporal_patch_size=1,
        resize_resolution=2048,
        img_max_token_num=4096,
        max_image_size=2048,
        video_max_image_size=768,
        video_min_image_size=256,
        min_image_size=512,
        anyres_vit_max_image_size=2048,
        max_vit_seq_len=16384,
        text_hidden_size=3072,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Core transformer hyper-parameters.
        self.hidden_act = hidden_act
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.interpolate_mode = interpolate_mode
        self.learnable_mlp_pooling_size = learnable_mlp_pooling_size
        self.num_attention_heads = num_attention_heads
        # A falsy (None/0) key-value head count falls back to plain MHA.
        self.num_key_value_heads = num_key_value_heads or num_attention_heads
        self.num_channels = num_channels
        self.num_hidden_layers = num_hidden_layers
        self.out_hidden_size = out_hidden_size
        self.patch_size = patch_size
        self.remove_prenorm = remove_prenorm
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.rms_norm_eps = rms_norm_eps

        # Image/video sizing limits used at preprocessing time.
        self.resize_resolution = resize_resolution
        self.img_max_token_num = img_max_token_num
        self.max_image_size = max_image_size
        self.min_image_size = min_image_size
        self.video_max_image_size = video_max_image_size
        self.video_min_image_size = video_min_image_size
        self.anyres_vit_max_image_size = anyres_vit_max_image_size
        self.max_vit_seq_len = max_vit_seq_len
        self.text_hidden_size = text_hidden_size
class HunYuanVLTextConfig(PretrainedConfig):
    r"""
    Configuration for the text (LLM) backbone of HunYuan-VL.

    Instantiating a configuration with the defaults yields a configuration
    similar to that of
    [tencent/Hunyuan-7B-Instruct](https://huggingface.co/tencent/Hunyuan-7B-Instruct).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to
    control the model outputs. Read the documentation from [`PretrainedConfig`]
    for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 290943):
            Vocabulary size of the HunYuan model.
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads per attention layer.
        num_key_value_heads (`int`, *optional*):
            Number of key/value heads for Grouped Query Attention. Equal to
            `num_attention_heads` means MHA, 1 means MQA, anything in between
            is GQA. Defaults to `num_attention_heads` when unset.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Std-dev of the truncated-normal initializer for weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the RMS normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether the model should return the last key/value attentions.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning-of-stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End-of-stream token id.
        eod_token_id (`int`, *optional*, defaults to 3):
            End-of-document token id, marking the termination of a text
            sequence (e.g. separating documents in multi-document inputs).
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental: tensor-parallelism rank used during pretraining,
            needed to exactly reproduce pretraining results.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie input/output word embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`dict`, *optional*):
            RoPE scaling configuration, e.g.
            `{"type": "linear"|"dynamic", "factor": float}` (or `"alpha"`
            instead of `"factor"`).
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the q/k/v/o projection layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout ratio for the attention probabilities.
        head_dim (`int`, *optional*):
            The attention head dimension. Left as `None` by default so the
            model can derive it from `hidden_size // num_attention_heads`.
    """

    model_type = "hunyuan_vl_text"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=290943,
        hidden_size=4096,
        intermediate_size: int = 11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        eod_token_id=3,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        head_dim=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # self._rope_scaling_validation()  # TODO: Need validation?
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        # FIX: eod_token_id was accepted (and documented) but silently
        # discarded; store it so downstream code can read it.
        self.eod_token_id = eod_token_id

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        Raises:
            ValueError: If `rope_scaling` is present but malformed.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
            # FIX: message previously read "with with two fields".
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, `type` and "
                f"`factor` or `type` and `alpha`, got {self.rope_scaling}"
            )
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        rope_scaling_alpha = self.rope_scaling.get("alpha", None)
        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
            raise ValueError(
                "`rope_scaling`'s type field must be one of ['linear', 'dynamic'], "
                f"got {rope_scaling_type}"
            )
        if rope_scaling_factor is None and rope_scaling_alpha is None:
            # FIX: message previously read "must be have one, got both of none".
            raise ValueError(
                "`rope_scaling` must have one of the `factor` or `alpha` fields, "
                "got neither"
            )
        if rope_scaling_factor is not None and (
            not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0
        ):
            raise ValueError(
                "`rope_scaling`'s factor field must be a float > 1.0, "
                f"got {rope_scaling_factor}"
            )
        if rope_scaling_alpha is not None and (
            not isinstance(rope_scaling_alpha, float) or rope_scaling_alpha <= 1.0
        ):
            raise ValueError(
                "`rope_scaling`'s alpha field must be a float > 1.0, "
                f"got {rope_scaling_alpha}"
            )
class HunYuanVLConfig(PretrainedConfig):
    """Top-level HunYuan-VL configuration composing vision and text sub-configs.

    Attribute reads/writes for text-model fields are transparently delegated
    to `text_config` via `__getattribute__`/`__setattr__`, so callers can use
    `config.hidden_size` interchangeably with `config.text_config.hidden_size`.
    """

    model_type = "hunyuan_vl"
    sub_configs = {
        "vision_config": HunYuanVLVisionConfig,
        "text_config": HunYuanVLTextConfig,
    }
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        im_start_id=120118,
        im_end_id=120119,
        image_token_id=120120,
        im_newline_id=120121,
        video_start_id=120122,
        video_end_id=120123,
        **kwargs,
    ):
        # We need to init super() here so that it does not reset values
        # that are in text config to the BaseClass defaults. The Base
        # config has many text related defaults and not all defaults are
        # same as for `HunYuanVLTextConfig`.
        super().__init__(**kwargs)

        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()
        else:
            # FIX: an already-instantiated config object previously left
            # self.vision_config unset, crashing on the text_hidden_size
            # assignment below.
            self.vision_config = vision_config

        if isinstance(text_config, dict):
            self.text_config = self.sub_configs["text_config"](**text_config)
        elif text_config is None:
            # For BC use all kwargs to init `TextConfig`
            self.text_config = self.sub_configs["text_config"](**kwargs)
        else:
            # FIX: same as vision_config above.
            self.text_config = text_config

        self.image_token_id = image_token_id
        self.im_start_id = im_start_id
        self.im_end_id = im_end_id
        self.im_newline_id = im_newline_id
        self.video_start_id = video_start_id
        self.video_end_id = video_end_id

        # Keep the vision tower's output projection in sync with the LLM width.
        self.vision_config.text_hidden_size = self.text_config.hidden_size

        # Attention implementation to use. It sets it recursively on
        # sub-configs so we call it again in the end.
        # NOTE(review): kwargs was already forwarded to super().__init__ and
        # possibly to TextConfig above; popping here only affects this local
        # dict — confirm that is intentional before reordering.
        self._attn_implementation = kwargs.pop("attn_implementation", None)

    def __setattr__(self, key, value):
        # Mirror writes of text-model attributes onto the wrapped text_config
        # so `config.hidden_size = x` updates the sub-config rather than
        # creating a shadow copy on the wrapper.
        if (
            (text_config := super().__getattribute__("__dict__").get("text_config"))
            is not None
            and key not in ["dtype", "_attn_implementation_internal"]
            and key in text_config.__dict__
        ):
            setattr(text_config, key, value)
        else:
            super().__setattr__(key, value)

    def __getattribute__(self, key):
        # Delegate reads of text-model attributes to text_config (except a
        # small allowlist of wrapper-level names).
        if "text_config" in super().__getattribute__("__dict__") and key not in [
            "_name_or_path",
            "model_type",
            "dtype",
            "_attn_implementation_internal",
        ]:
            text_config = super().__getattribute__("text_config")
            if key in text_config.__dict__:
                return getattr(text_config, key)

        return super().__getattribute__(key)
class HunYuanVLProcessor(ProcessorMixin):
    """Combines a HunYuan-VL image processor and tokenizer into one processor.

    Expands each image placeholder token in the text into the number of
    vision tokens implied by that image's patch grid, builds 4-axis
    (XD-RoPE-style) position ids for the image spans, and merges text and
    image features into a single `BatchFeature`.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"  # ("AutoTokenizer", None)

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        video_processor=None,
        chat_template=None,
        **kwargs,
    ):
        # TODO Fix the init: the special token ids below are hard-coded
        # instead of being read from the tokenizer.
        self.tokenizer = tokenizer
        self.image_token_id = 120120  # self.tokenizer.image_token_id
        self.image_token = self.tokenizer.convert_ids_to_tokens(self.image_token_id)
        self.im_start_token_id = 120118  # self.tokenizer.im_start_id
        self.im_start_token = self.tokenizer.convert_ids_to_tokens(
            self.im_start_token_id
        )
        self.im_end_token_id = 120119  # self.tokenizer.im_end_id
        self.im_end_token = self.tokenizer.convert_ids_to_tokens(self.im_end_token_id)
        # Last vocab entry is used as a temporary stand-in while expanding
        # image placeholders, then swapped back to the image token.
        self.placeholder_token = self.tokenizer.convert_ids_to_tokens(
            self.tokenizer.vocab_size - 1
        )
        self.pad_id = 120002  # self.tokenizer.pad_token_id

        super().__init__(
            image_processor, tokenizer, video_processor, chat_template=chat_template
        )

    def __call__(
        self,
        images: ImageInput = None,
        text: TextInput
        | PreTokenizedInput
        | list[TextInput]
        | list[PreTokenizedInput] = None,
        videos: VideoInput = None,
        **kwargs,
    ) -> BatchFeature:
        """Tokenize *text*, expand image placeholders, and merge image features.

        Returns a `BatchFeature` containing the tokenizer outputs (plus
        `position_ids`, `attention_mask` and `imgs_pos`) and the image
        processor outputs.
        """
        image_inputs = {}
        if images is not None:
            image_inputs = self.image_processor(images=images)
            image_grid_thw = image_inputs["image_grid_thw"]

        if not isinstance(text, list):
            text = [text]

        text = text.copy()  # below lines change text in-place

        # Running total of expanded vision tokens, indexed per image.
        image_tokens_cumsum = [0]
        if images is not None:
            index = 0
            for i in range(len(text)):
                while self.image_token in text[i]:
                    grid_h, grid_w = image_grid_thw[index][-2:]
                    patch_h = grid_h // self.image_processor.merge_size
                    patch_w = grid_w // self.image_processor.merge_size
                    # +1 per row (newline-style token) and +2 for start/end.
                    num_image_tokens = patch_h * (patch_w + 1) + 2
                    image_tokens_cumsum.append(
                        image_tokens_cumsum[-1] + num_image_tokens
                    )
                    # Expand via a distinct placeholder so already-expanded
                    # spans are not matched again by the `while` loop.
                    text[i] = text[i].replace(
                        self.image_token, self.placeholder_token * num_image_tokens, 1
                    )
                    index += 1
                text[i] = text[i].replace(self.placeholder_token, self.image_token)

        text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs)
        self._check_special_mm_tokens(text, text_inputs, modalities=["image"])

        # Four position axes: sequential, image-column (w), image-row (h),
        # and temporal (t); text positions stay sequential on every axis.
        input_ids = text_inputs["input_ids"]
        position_ids = torch.arange(len(input_ids[0]))
        position_ids_w = torch.arange(len(input_ids[0]))
        position_ids_h = torch.arange(len(input_ids[0]))
        position_ids_t = torch.arange(len(input_ids[0]))

        if images is not None:
            image_token_pos_indices = torch.where(input_ids[0] == self.image_token_id)[
                0
            ]
            for i in range(len(image_grid_thw)):
                grid_h, grid_w = image_grid_thw[i][-2:]
                patch_h = grid_h // self.image_processor.merge_size
                patch_w = grid_w // self.image_processor.merge_size
                start_pos = image_token_pos_indices[image_tokens_cumsum[i]].item() + 1
                replace_num = (patch_w + 1) * patch_h
                position_ids_w[start_pos : start_pos + replace_num] = torch.tensor(
                    list(range(patch_w + 1)) * patch_h, dtype=torch.int64
                )
                patch_h_list = []
                for h in range(patch_h):
                    patch_h_list += [h] * (patch_w + 1)
                position_ids_h[start_pos : start_pos + replace_num] = torch.tensor(
                    patch_h_list, dtype=torch.int64
                )
                position_ids_t[start_pos : start_pos + replace_num] = 0

        position_ids = torch.stack(
            [position_ids, position_ids_w, position_ids_h, position_ids_t]
        ).unsqueeze(0)
        text_inputs["position_ids"] = position_ids

        attention_mask = input_ids.ne(self.pad_id)
        text_inputs["attention_mask"] = attention_mask
        text_inputs["imgs_pos"] = [self.get_imgs_pos(input_ids)]

        # NOTE(review): return_tensors is popped only after kwargs was already
        # forwarded to the tokenizer above — confirm that is intentional.
        return_tensors = kwargs.pop("return_tensors", None)
        return BatchFeature(
            data={**text_inputs, **image_inputs},
            tensor_type=return_tensors,
        )

    def batch_decode(self, *args, **kwargs):
        """Forward to the tokenizer's `batch_decode`."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Forward to the tokenizer's `decode`."""
        return self.tokenizer.decode(*args, **kwargs)

    def post_process_image_text_to_text(
        self,
        generated_outputs,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        # FIX: was `assert 0`, which is stripped under `python -O` and raises
        # an uninformative AssertionError otherwise.
        raise NotImplementedError(
            "post_process_image_text_to_text is not supported by "
            "HunYuanVLProcessor"
        )

    def apply_chat_template(self, *args, **kwargs):
        """Forward to the tokenizer's `apply_chat_template`."""
        token_ids = self.tokenizer.apply_chat_template(*args, **kwargs)
        return token_ids

    def get_imgs_pos(self, doc_ids):
        """Return [start, end) index pairs of the image spans in *doc_ids*.

        Each pair covers the tokens strictly between an image-start token and
        its matching image-end token.
        """
        doc_ids = np.array(doc_ids, dtype=np.int64)
        img_begin_index = np.where(doc_ids == self.im_start_token_id)[0]
        img_end_index = np.where(doc_ids == self.im_end_token_id)[0]
        imgs_pos = np.concatenate(
            (
                np.reshape(img_begin_index + 1, (-1, 1)),
                np.reshape(img_end_index, (-1, 1)),
            ),
            axis=-1,
        ).tolist()
        return imgs_pos

    @property
    def model_input_names(self):
        # Union of tokenizer and image-processor input names, order-preserving.
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
def split_image_into_patch_blocks(
    pixel_values: torch.Tensor,  # shape: [batch_size, 3, H, W]
    patch_size: int = 16,
    adaptor_patch_div: int = 4,
) -> torch.Tensor:
    """Regroup images into flattened patch blocks of shape [N, 3, ps, ps].

    Each image is cut into a grid of `patch_size` x `patch_size` patches, and
    every patch is internally subdivided into
    (patch_size // adaptor_patch_div)^2 small regions of
    `adaptor_patch_div` x `adaptor_patch_div` pixels. The small regions
    belonging to one patch are packed contiguously into a single
    [3, patch_size, patch_size] block.

    Args:
        pixel_values: Input image tensor of shape [batch_size, 3, H, W].
        patch_size: Side length of a large patch, e.g. 16.
        adaptor_patch_div: Side length of the small regions each patch is
            subdivided into; must divide `patch_size`.

    Returns:
        A tensor of shape [N, 3, patch_size, patch_size], where
        N = batch_size * (H // patch_size) * (W // patch_size).
        (The element count per block is 3 * patch_size^2 regardless of
        `adaptor_patch_div`, which only affects the internal pixel order.)

    Raises:
        ValueError: If `pixel_values` does not have 3 channels, or H/W are
            not divisible by `patch_size`.
    """
    batch_size, channels, height, width = pixel_values.shape
    # Validate with real exceptions: `assert` is stripped under `python -O`.
    if channels != 3:
        raise ValueError("Pixel values must have 3 channels in dim=1")
    if height % patch_size != 0 or width % patch_size != 0:
        raise ValueError("H and W must be divisible by patch_size")

    patch_height_num = height // patch_size
    patch_width_num = width // patch_size

    # Reshape to [B, 3, ph, ps, pw, ps]
    img = pixel_values.reshape(
        batch_size, 3, patch_height_num, patch_size, patch_width_num, patch_size
    )

    # Split each patch axis into (ps // aps) regions of aps pixels:
    # [B, 3, ph, ps//aps, aps, pw, ps//aps, aps]
    img = img.reshape(
        batch_size,
        3,
        patch_height_num,
        patch_size // adaptor_patch_div,
        adaptor_patch_div,
        patch_width_num,
        patch_size // adaptor_patch_div,
        adaptor_patch_div,
    )

    # Group small regions per patch: [B, ph, pw, ps//aps, ps//aps, 3, aps, aps]
    img = img.permute(0, 2, 5, 3, 6, 1, 4, 7)

    # Pack the (ps//aps)^2 small regions of each patch into one block.
    patches = img.reshape(-1, 3, patch_size, patch_size)

    return patches
def smart_resize(
    height: int,
    width: int,
    factor: int = 16,
    min_pixels: int = 512 * 512,
    max_pixels: int = 2048 * 2048,
):
    """Choose output dimensions that satisfy three constraints:

    1. Both dimensions are divisible by `factor`.
    2. The total pixel count lies within [`min_pixels`, `max_pixels`].
    3. The original aspect ratio is preserved as closely as possible.

    Returns the `(height, width)` pair after adjustment.

    Raises:
        ValueError: If the aspect ratio exceeds 200:1.
    """
    aspect_ratio = max(height, width) / min(height, width)
    if aspect_ratio > 200:
        raise ValueError(
            f"absolute aspect ratio must be smaller than 200, got {aspect_ratio}"
        )

    # Snap both sides to the nearest multiple of `factor`.
    new_h = round(height / factor) * factor
    new_w = round(width / factor) * factor

    if new_h * new_w > max_pixels:
        # Too large: shrink both sides by the same ratio, rounding down so
        # the budget is respected, but never below one `factor` unit.
        shrink = math.sqrt((height * width) / max_pixels)
        new_h = max(factor, math.floor(height / shrink / factor) * factor)
        new_w = max(factor, math.floor(width / shrink / factor) * factor)
    elif new_h * new_w < min_pixels:
        # Too small: grow both sides by the same ratio, rounding up so the
        # minimum pixel count is reached.
        grow = math.sqrt(min_pixels / (height * width))
        new_h = math.ceil(height * grow / factor) * factor
        new_w = math.ceil(width * grow / factor) * factor

    return new_h, new_w
+ if min_pixels is not None: + size["shortest_edge"] = min_pixels + if max_pixels is not None: + size["longest_edge"] = max_pixels + self.min_pixels = size["shortest_edge"] + self.max_pixels = size["longest_edge"] + self.size = size + + self.do_resize = do_resize + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + + self.patch_size = patch_size + self.temporal_patch_size = temporal_patch_size + self.merge_size = merge_size + self.do_convert_rgb = do_convert_rgb + + # hard-code + + def _preprocess( + self, + images: ImageInput | VideoInput, + do_resize: bool | None = None, + size: dict[str, int] | None = None, + resample: PILImageResampling = None, + do_rescale: bool | None = None, + rescale_factor: float | None = None, + do_normalize: bool | None = None, + image_mean: float | list[float] | None = None, + image_std: float | list[float] | None = None, + patch_size: int = 16, + temporal_patch_size: int = 2, + merge_size: int = 2, + do_convert_rgb: bool | None = None, + data_format: ChannelDimension | None = ChannelDimension.FIRST, + input_data_format: str | ChannelDimension | None = None, + ): + """ + Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present. 
+ resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Scale factor to use if rescaling the image. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image. + image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image. + patch_size (`int`, *optional*, defaults to `self.patch_size`): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to `self.merge_size`): + The merge size of the vision encoder to llm encoder. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. 
Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ # noqa: E501 + images = make_list_of_images(images) + + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + width, height = images[0].width, images[0].height + resized_width, resized_height = width, height + processed_images = [] + for image in images: + if do_resize: + resized_width, resized_height = smart_resize( + width, + height, + factor=patch_size * merge_size, + min_pixels=self.min_pixels, + max_pixels=self.max_pixels, + ) + image = image.resize((resized_width, resized_height)) + + if do_normalize: + image = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize(self.image_mean, self.image_std), + ] + )(image) + processed_images.append(image) + + patches = np.array(processed_images) + channel = patches.shape[1] + grid_t = patches.shape[0] // temporal_patch_size + grid_h, grid_w = resized_height // patch_size, resized_width // patch_size + patches = patches.reshape( + 1, + channel, + grid_h // merge_size, + merge_size, + patch_size, + grid_w // merge_size, + merge_size, + patch_size, + ) + patches = patches.transpose(0, 2, 3, 5, 6, 1, 4, 7) + flatten_patches = patches.reshape( + 1 * grid_h * grid_w, channel * patch_size * patch_size + ) + + return flatten_patches, (grid_t, grid_h, grid_w) + + def preprocess( + self, + images: ImageInput, + videos: VideoInput = None, + do_resize: bool | None = None, + size: dict[str, int] | None = None, + min_pixels: int | None = None, + max_pixels: int | None = None, + resample: PILImageResampling = None, + do_rescale: bool | None = None, + rescale_factor: float | None = None, + do_normalize: bool | None = None, + image_mean: 
float | list[float] | None = None, + image_std: float | list[float] | None = None, + patch_size: int | None = None, + temporal_patch_size: int | None = None, + merge_size: int | None = None, + do_convert_rgb: bool | None = None, + return_tensors: str | TensorType | None = None, + data_format: ChannelDimension | None = ChannelDimension.FIRST, + input_data_format: str | ChannelDimension | None = None, + ): + """ + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + videos (`VideoInput`): + Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If + passing in videos with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. 
+ image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + min_pixels (`int`, *optional*, defaults to `self.min_pixels`): + The min pixels of the image to resize the image. + max_pixels (`int`, *optional*, defaults to `self.max_pixels`): + The max pixels of the image to resize the image. + patch_size (`int`, *optional*, defaults to `self.patch_size`): + The spatial patch size of the vision encoder. + temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`): + The temporal patch size of the vision encoder. + merge_size (`int`, *optional*, defaults to `self.merge_size`): + The merge size of the vision encoder to llm encoder. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. 
Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + """ # noqa: E501 + min_pixels = min_pixels if min_pixels is not None else self.min_pixels + max_pixels = max_pixels if max_pixels is not None else self.max_pixels + + if size is not None: + if "shortest_edge" not in size or "longest_edge" not in size: + raise ValueError( + "size must contain 'shortest_edge' and 'longest_edge' keys." + ) + min_pixels = size["shortest_edge"] + elif min_pixels is not None and max_pixels is not None: + # backward compatibility: override size with min_pixels and max_pixels + # if they are provided. + size = {"shortest_edge": min_pixels, "longest_edge": max_pixels} + else: + size = {**self.size} + + do_resize = do_resize if do_resize is not None else self.do_resize + + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = ( + rescale_factor if rescale_factor is not None else self.rescale_factor + ) + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + patch_size = patch_size if patch_size is not None else self.patch_size + temporal_patch_size = ( + temporal_patch_size + if temporal_patch_size is not None + else self.temporal_patch_size + ) + merge_size = merge_size if merge_size is not None else self.merge_size + do_convert_rgb = ( + do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + ) + + if images is not None: + images = make_flat_list_of_images(images) + + if images is not None and not valid_images(images): + raise ValueError( + "Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + data = {} + if images is not None: + pixel_values, vision_grid_thws = [], [] + for image in images: + patches, image_grid_thw = self._preprocess( + image, + do_resize=do_resize, + size=size, + resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + patch_size=patch_size, + temporal_patch_size=temporal_patch_size, + merge_size=merge_size, + data_format=data_format, + do_convert_rgb=do_convert_rgb, + input_data_format=input_data_format, + ) + pixel_values.extend(patches) + vision_grid_thws.append(image_grid_thw) + pixel_values = np.array(pixel_values) + vision_grid_thws = np.array(vision_grid_thws) + data.update( + {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws} + ) + + # kept for BC only and should be removed after v5.0 + if videos is not None: + logger.warning( + "`HunYuanVLV1ImageProcessor` works only with image inputs " + "and doesn't process videos anymore. " + "This is a deprecated behavior and will be removed in v5.0. " + "Your videos should be forwarded to `HunYuanVLV1VideoProcessor`. 
" + ) + videos = make_batched_videos(videos) + pixel_values_videos, vision_grid_thws_videos = [], [] + for images in videos: + patches, video_grid_thw = self._preprocess( + images, + do_resize=do_resize, + size=size, + resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + patch_size=patch_size, + temporal_patch_size=temporal_patch_size, + merge_size=merge_size, + data_format=data_format, + do_convert_rgb=do_convert_rgb, + input_data_format=input_data_format, + ) + pixel_values_videos.extend(patches) + vision_grid_thws_videos.append(video_grid_thw) + data.update( + { + "pixel_values_videos": np.array(pixel_values_videos), + "video_grid_thw": np.array(vision_grid_thws_videos), + } + ) + + return BatchFeature(data=data, tensor_type=return_tensors) + + def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None): + """ + A utility that returns number of image patches for a given image size. + + Args: + height (`int`): + Height of the input image. + width (`int`): + Width of the input image. + images_kwargs (`dict`, *optional*): + Any kwargs to override defaults of the image processor. + Returns: + `int`: Number of image patches per image. 
+ """ + min_pixels = ( + images_kwargs["min_pixels"] + if "min_pixels" in images_kwargs + else self.size["shortest_edge"] + ) + max_pixels = ( + images_kwargs["max_pixels"] + if "max_pixels" in images_kwargs + else self.size["longest_edge"] + ) + patch_size = images_kwargs.get("patch_size", self.patch_size) + merge_size = images_kwargs.get("merge_size", self.merge_size) + + factor = patch_size * merge_size + resized_height, resized_width = smart_resize( + height, width, factor, min_pixels=min_pixels, max_pixels=max_pixels + ) + grid_h, grid_w = resized_height // patch_size, resized_width // patch_size + return grid_h * (grid_w + 1) + 2 + + +AutoImageProcessor.register("HunYuanVLImageProcessor", HunYuanVLImageProcessor) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index d94da71b289f3..fddcc27204307 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -52,9 +52,11 @@ STR_FLASHINFER_ATTN_VAL: str = "FLASHINFER" STR_FLASH_ATTN_VAL: str = "FLASH_ATTN" STR_INVALID_VAL: str = "INVALID" +MASK_64_BITS = (1 << 64) - 1 + def random_uuid() -> str: - return str(uuid.uuid4().hex) + return f"{uuid.uuid4().int & MASK_64_BITS:016x}" # 16 hex chars def length_from_prompt_token_ids_or_embeds( diff --git a/vllm/v1/attention/backends/linear_attn.py b/vllm/v1/attention/backends/linear_attn.py index 1900c50849eca..004baa2d09cde 100644 --- a/vllm/v1/attention/backends/linear_attn.py +++ b/vllm/v1/attention/backends/linear_attn.py @@ -7,6 +7,7 @@ import torch from vllm.attention.backends.abstract import AttentionBackend from vllm.config import VllmConfig from vllm.v1.attention.backends.utils import ( + AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, split_decodes_and_prefills, @@ -35,6 +36,8 @@ class LinearAttentionMetadata: class LinearAttentionMetadataBuilder(AttentionMetadataBuilder[LinearAttentionMetadata]): reorder_batch_threshold: int = 1 + _cudagraph_support = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE + def __init__( self, 
kv_cache_spec: AttentionSpec, diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 43aef8a7cca91..87a3aac21d2c3 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -340,6 +340,8 @@ class MLACommonPrefillMetadata: max_seq_lens: list[int] seq_lens: torch.Tensor workspace: torch.Tensor + token_to_seq: torch.Tensor + chunk_total_token: list[int] # for mla DCP padded_local_chunk_seq_lens: list[list[int]] | None = None @@ -839,6 +841,19 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): torch.cumsum( chunk_seq_lens, dim=1, out=cu_seq_lens_cpu[:, 1:], dtype=torch.int32 ) + chunk_total_token = cu_seq_lens_cpu[:, -1] + + max_token_num_over_chunk = chunk_total_token.max().item() + token_to_seq_tensor_cpu = torch.zeros( + [num_chunks, max_token_num_over_chunk], dtype=torch.int32 + ) + range_idx = torch.arange(num_prefills, dtype=torch.int32) + for i in range(num_chunks): + chunk_token_to_seq_tensor = torch.repeat_interleave( + range_idx, chunk_seq_lens[i] + ) + chunk_len = chunk_token_to_seq_tensor.shape[0] + token_to_seq_tensor_cpu[i, :chunk_len] = chunk_token_to_seq_tensor if self.dcp_world_size > 1: local_context_lens_allranks = get_dcp_local_seq_lens( @@ -906,6 +921,10 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): seq_tot=padded_local_chunk_seq_lens.sum(dim=1).tolist(), max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(), seq_lens=chunk_seq_lens, + token_to_seq=token_to_seq_tensor_cpu.to( + device, non_blocking=True + ), + chunk_total_token=chunk_total_token.tolist(), workspace=self.chunked_prefill_workspace, padded_local_chunk_seq_lens=padded_local_chunk_seq_lens.tolist(), local_context_lens_allranks=local_context_lens_allranks.tolist(), @@ -922,6 +941,10 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): seq_tot=chunk_seq_lens.sum(dim=1).tolist(), max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(), 
seq_lens=chunk_seq_lens, + token_to_seq=token_to_seq_tensor_cpu.to( + device, non_blocking=True + ), + chunk_total_token=chunk_total_token, workspace=self.chunked_prefill_workspace, ) @@ -1638,16 +1661,15 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): output = None iters = len(prefill_metadata.chunked_context.seq_tot) workspace = prefill_metadata.chunked_context.workspace - for i in range(iters): toks = prefill_metadata.chunked_context.seq_tot[i] - ops.gather_and_maybe_dequant_cache( src_cache=kv_c_and_k_pe_cache, dst=workspace, block_table=prefill_metadata.block_table, cu_seq_lens=prefill_metadata.chunked_context.cu_seq_lens[i], - batch_size=attn_metadata.num_prefills, + token_to_seq=prefill_metadata.chunked_context.token_to_seq[i], + num_tokens=prefill_metadata.chunked_context.chunk_total_token[i], kv_cache_dtype=self.kv_cache_dtype, scale=k_scale, seq_starts=prefill_metadata.chunked_context.starts[i], diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 56f9c7a281e7f..00a0a77a1c2f7 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -49,6 +49,8 @@ class AiterMLADecodeMetadata(MLACommonDecodeMetadata): paged_kv_last_page_len: torch.Tensor | None = None # The query indptr, shape : [num_decode + 1] qo_indptr: torch.Tensor | None = None + # The dtype of MLA out tensor + attn_out_dtype: torch.dtype = torch.bfloat16 class AiterMLAMetadata(MLACommonMetadata[AiterMLADecodeMetadata]): @@ -74,6 +76,7 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): ) self.compilation_config = vllm_config.compilation_config + self.decode_attn_out_dtype = vllm_config.model_config.dtype # kernel block size is always 1. 
max_num_pages_per_req = vllm_config.model_config.max_model_len max_num_reqs = vllm_config.scheduler_config.max_num_seqs @@ -162,6 +165,7 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): paged_kv_last_page_len=paged_kv_last_page_len, qo_indptr=qo_indptr, dcp_tot_seq_lens=dcp_tot_seq_lens_device, + attn_out_dtype=self.decode_attn_out_dtype, ) return attn_metadata @@ -242,7 +246,11 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]): assert isinstance(q, torch.Tensor) B = q.shape[0] o = torch.zeros( - B, self.num_heads, self.kv_lora_rank, dtype=q.dtype, device=q.device + B, + self.num_heads, + self.kv_lora_rank, + dtype=attn_metadata.decode.attn_out_dtype, + device=q.device, ) kv_buffer = kv_c_and_k_pe_cache.unsqueeze(2) @@ -260,6 +268,8 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]): attn_metadata.decode.paged_kv_indptr, attn_metadata.decode.paged_kv_indices, attn_metadata.decode.paged_kv_last_page_len, + q_scale=layer._q_scale, + kv_scale=layer._k_scale, ) return o, None diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index b18ba8e8b2c7b..a0033fa650baa 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -971,7 +971,16 @@ def _get_kv_cache_groups_uniform_page_size( # is the minimum number of layers among all attention types. Need a better # strategy if we want to support more complex patterns (e.g., 20 full + 30 # sw, where the group size should be 10). - group_size = min([len(layers) for layers in same_type_layers.values()]) + min_num_layers = min([len(layers) for layers in same_type_layers.values()]) + group_size = min_num_layers + max_num_layers = max([len(layers) for layers in same_type_layers.values()]) + if max_num_layers < min_num_layers * 1.25: + # If the number of layers is not much larger than the minimum number of layers, + # use the maximum number of layers as the group size to avoid too many padding + # layers. 
A typical example is gpt-oss-20b + eagle, with 12 sw + 13 full. We + # pad it to (13 sw, 13 full) instead of (12 sw, 24 full). 1.25 is just a + # magic number to avoid too many padding layers. + group_size = max_num_layers grouped_layers = [] for layers in same_type_layers.values(): num_padding_layers = group_size - len(layers) % group_size diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index a7ec0de372631..23af014c10364 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -508,9 +508,9 @@ class Scheduler(SchedulerInterface): not self.scheduler_config.enable_chunked_prefill and num_new_tokens > token_budget ): - self.waiting.pop_request() - skipped_waiting_requests.prepend_request(request) - continue + # If chunked_prefill is disabled, + # we can stop the scheduling here. + break num_new_tokens = min(num_new_tokens, token_budget) assert num_new_tokens > 0 diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index e2d82241ce210..bd18a152ffc08 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -440,57 +440,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase): # Setting default values self.record_sleep_state() - # GPU cache - # - # Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc - # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10 - # TODO: remove in 0.12.0 - if self.show_hidden_metrics: - gauge_gpu_cache_usage = self._gauge_cls( - name="vllm:gpu_cache_usage_perc", - documentation=( - "GPU KV-cache usage. 1 means 100 percent usage." - "DEPRECATED: Use vllm:kv_cache_usage_perc instead." 
- ), - multiprocess_mode="mostrecent", - labelnames=labelnames, - ) - self.gauge_gpu_cache_usage = make_per_engine( - gauge_gpu_cache_usage, engine_indexes, model_name - ) - - # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries - # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10 - # TODO: remove in 0.12.0 - if self.show_hidden_metrics: - counter_gpu_prefix_cache_queries = self._counter_cls( - name="vllm:gpu_prefix_cache_queries", - documentation=( - "GPU prefix cache queries, in terms of number of queried" - "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead." - ), - labelnames=labelnames, - ) - self.counter_gpu_prefix_cache_queries = make_per_engine( - counter_gpu_prefix_cache_queries, engine_indexes, model_name - ) - - # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits - # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10 - # TODO: remove in 0.12.0 - if self.show_hidden_metrics: - counter_gpu_prefix_cache_hits = self._counter_cls( - name="vllm:gpu_prefix_cache_hits", - documentation=( - "GPU prefix cache hits, in terms of number of cached " - "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead." - ), - labelnames=labelnames, - ) - self.counter_gpu_prefix_cache_hits = make_per_engine( - counter_gpu_prefix_cache_hits, engine_indexes, model_name - ) - gauge_kv_cache_usage = self._gauge_cls( name="vllm:kv_cache_usage_perc", documentation="KV-cache usage. 1 means 100 percent usage.", @@ -735,39 +684,41 @@ class PrometheusStatLogger(AggregateStatLoggerBase): ) # Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds - # TODO: in 0.12, only enable if show_hidden_metrics=True - histogram_time_per_output_token = self._histogram_cls( - name="vllm:time_per_output_token_seconds", - documentation=( - "Histogram of time per output token in seconds." - "DEPRECATED: Use vllm:inter_token_latency_seconds instead." 
- ), - buckets=[ - 0.01, - 0.025, - 0.05, - 0.075, - 0.1, - 0.15, - 0.2, - 0.3, - 0.4, - 0.5, - 0.75, - 1.0, - 2.5, - 5.0, - 7.5, - 10.0, - 20.0, - 40.0, - 80.0, - ], - labelnames=labelnames, - ) - self.histogram_time_per_output_token = make_per_engine( - histogram_time_per_output_token, engine_indexes, model_name - ) + # With 0.12.x you can enable with --show-hidden-metrics-for-version=0.11 + # TODO: remove in 0.13.0 + if self.show_hidden_metrics: + histogram_time_per_output_token = self._histogram_cls( + name="vllm:time_per_output_token_seconds", + documentation=( + "Histogram of time per output token in seconds." + "DEPRECATED: Use vllm:inter_token_latency_seconds instead." + ), + buckets=[ + 0.01, + 0.025, + 0.05, + 0.075, + 0.1, + 0.15, + 0.2, + 0.3, + 0.4, + 0.5, + 0.75, + 1.0, + 2.5, + 5.0, + 7.5, + 10.0, + 20.0, + 40.0, + 80.0, + ], + labelnames=labelnames, + ) + self.histogram_time_per_output_token = make_per_engine( + histogram_time_per_output_token, engine_indexes, model_name + ) histogram_inter_token_latency = self._histogram_cls( name="vllm:inter_token_latency_seconds", @@ -966,20 +917,8 @@ class PrometheusStatLogger(AggregateStatLoggerBase): self.gauge_scheduler_waiting[engine_idx].set( scheduler_stats.num_waiting_reqs ) - if self.show_hidden_metrics: - self.gauge_gpu_cache_usage[engine_idx].set( - scheduler_stats.kv_cache_usage - ) self.gauge_kv_cache_usage[engine_idx].set(scheduler_stats.kv_cache_usage) - if self.show_hidden_metrics: - self.counter_gpu_prefix_cache_queries[engine_idx].inc( - scheduler_stats.prefix_cache_stats.queries - ) - self.counter_gpu_prefix_cache_hits[engine_idx].inc( - scheduler_stats.prefix_cache_stats.hits - ) - self.counter_prefix_cache_queries[engine_idx].inc( scheduler_stats.prefix_cache_stats.queries ) @@ -1050,7 +989,8 @@ class PrometheusStatLogger(AggregateStatLoggerBase): self.histogram_time_to_first_token[engine_idx].observe(ttft) for itl in iteration_stats.inter_token_latencies_iter: 
self.histogram_inter_token_latency[engine_idx].observe(itl) - self.histogram_time_per_output_token[engine_idx].observe(itl) + if self.show_hidden_metrics: + self.histogram_time_per_output_token[engine_idx].observe(itl) for finished_request in iteration_stats.finished_requests: self.counter_request_success[finished_request.finish_reason][ diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 3de418f1d13c8..784ccbc04932f 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -40,6 +40,7 @@ from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.sampler import _SAMPLING_EPS from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.utils import CpuGpuBuffer +from vllm.v1.worker.dp_utils import coordinate_batch_across_dp from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch logger = init_logger(__name__) @@ -65,6 +66,7 @@ class EagleProposer: self.dtype = vllm_config.model_config.dtype self.max_model_len = vllm_config.model_config.max_model_len self.block_size = vllm_config.cache_config.block_size + self.dp_rank = vllm_config.parallel_config.data_parallel_rank self.num_speculative_tokens = self.speculative_config.num_speculative_tokens self.max_num_tokens = vllm_config.scheduler_config.max_num_batched_tokens self.token_arange_np = np.arange(self.max_num_tokens) @@ -83,6 +85,9 @@ class EagleProposer: self.draft_indexer_metadata_builder: AttentionMetadataBuilder | None = None self.attn_layer_names: list[str] = [] self.indexer_layer_names: list[str] = [] + self.eagle3_use_aux_hidden_state: bool = ( + self._get_eagle3_use_aux_hidden_state_from_config() + ) self.use_cuda_graph = False @@ -268,15 +273,24 @@ class EagleProposer: assert draft_indexer_metadata is not None per_layer_attn_metadata[layer_name] = draft_indexer_metadata + num_tokens_dp_padded, num_tokens_across_dp = self._pad_batch_across_dp( + num_tokens_unpadded=num_tokens, + num_tokens_padded=num_tokens, + ) + 
cudagraph_runtime_mode = CUDAGraphMode.NONE if ( self.use_cuda_graph - and num_tokens <= self.compilation_config.max_cudagraph_capture_size + and num_tokens_dp_padded + <= self.compilation_config.max_cudagraph_capture_size ): - num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) + num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens_dp_padded) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE else: - num_input_tokens = num_tokens + num_input_tokens = num_tokens_dp_padded + if num_tokens_across_dp is not None: + num_tokens_across_dp[self.dp_rank] = num_input_tokens + # copy inputs to buffer for cudagraph self._set_positions(num_tokens, target_positions) self.hidden_states[:num_tokens] = target_hidden_states @@ -300,6 +314,7 @@ class EagleProposer: per_layer_attn_metadata, self.vllm_config, num_tokens=num_input_tokens, + num_tokens_across_dp=num_tokens_across_dp, cudagraph_runtime_mode=cudagraph_runtime_mode, ): ret_hidden_states = self.model( @@ -362,15 +377,23 @@ class EagleProposer: # Generate the remaining draft tokens. 
draft_token_ids_list = [draft_token_ids] + batch_size_dp_padded, batch_size_across_dp = self._pad_batch_across_dp( + num_tokens_unpadded=batch_size, + num_tokens_padded=batch_size, + ) + if ( self.use_cuda_graph - and batch_size <= self.compilation_config.max_cudagraph_capture_size + and batch_size_dp_padded + <= self.compilation_config.max_cudagraph_capture_size ): - input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size) + input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size_dp_padded) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE else: - input_batch_size = batch_size + input_batch_size = batch_size_dp_padded cudagraph_runtime_mode = CUDAGraphMode.NONE + if batch_size_across_dp is not None: + batch_size_across_dp[self.dp_rank] = input_batch_size common_attn_metadata.num_actual_tokens = batch_size common_attn_metadata.max_query_len = 1 @@ -471,6 +494,7 @@ class EagleProposer: per_layer_attn_metadata, self.vllm_config, num_tokens=input_batch_size, + num_tokens_across_dp=batch_size_across_dp, cudagraph_runtime_mode=cudagraph_runtime_mode, ): ret_hidden_states = self.model( @@ -1113,36 +1137,56 @@ class EagleProposer: self, num_tokens: int, use_cudagraphs=True, + is_graph_capturing=False, ) -> None: # Determine if CUDA graphs should be used for this run. cudagraphs_enabled = use_cudagraphs and self.use_cuda_graph - if ( - cudagraphs_enabled - and num_tokens <= self.compilation_config.max_cudagraph_capture_size - ): - num_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) - with set_forward_context( - None, - self.vllm_config, - num_tokens=num_tokens, - cudagraph_runtime_mode=( - CUDAGraphMode.PIECEWISE if cudagraphs_enabled else CUDAGraphMode.NONE - ), + # FIXME: when using tree-based specdec, adjust number of forward-passes + # according to the depth of the tree. 
+ for fwd_idx in range( + self.num_speculative_tokens if not is_graph_capturing else 1 ): - if self.supports_mm_inputs: - input_ids = None - inputs_embeds = self.inputs_embeds[:num_tokens] - else: - input_ids = self.input_ids[:num_tokens] - inputs_embeds = None + if fwd_idx <= 1: + num_tokens_dp_padded, num_tokens_across_dp = self._pad_batch_across_dp( + num_tokens_unpadded=num_tokens, + num_tokens_padded=num_tokens, + ) + if ( + cudagraphs_enabled + and num_tokens_dp_padded + <= self.compilation_config.max_cudagraph_capture_size + ): + num_input_tokens = self.vllm_config.pad_for_cudagraph( + num_tokens_dp_padded + ) + else: + num_input_tokens = num_tokens_dp_padded + if num_tokens_across_dp is not None: + num_tokens_across_dp[self.dp_rank] = num_input_tokens - self.model( - input_ids=input_ids, - positions=self._get_positions(num_tokens), - hidden_states=self.hidden_states[:num_tokens], - inputs_embeds=inputs_embeds, - ) + with set_forward_context( + None, + self.vllm_config, + num_tokens=num_input_tokens, + num_tokens_across_dp=num_tokens_across_dp, + cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE + if cudagraphs_enabled + else CUDAGraphMode.NONE, + ): + if self.supports_mm_inputs: + input_ids = None + inputs_embeds = self.inputs_embeds[:num_input_tokens] + else: + input_ids = self.input_ids[:num_input_tokens] + inputs_embeds = None + + self.model( + input_ids=input_ids, + positions=self._get_positions(num_input_tokens), + hidden_states=self.hidden_states[:num_input_tokens], + inputs_embeds=inputs_embeds, + ) def _get_attention_metadata_builder(self) -> AttentionMetadataBuilder: """Find and return the attention metadata builders for EAGLE layers. @@ -1169,6 +1213,22 @@ class EagleProposer: ) return builder + def _get_eagle3_use_aux_hidden_state_from_config(self) -> bool: + """ + Some eagle3 heads (e.g., nvidia/gpt-oss-120b-Eagle3-v2) do not use auxiliary + hidden states and directly uses the last layer output just like eagle1. 
+ They might indicate this by setting "use_aux_hidden_state" to False + inside the "eagle_config" dict of their hf_config. + """ + if self.method != "eagle3": + return False + # Assume that eagle3 heads use aux hidden states by default + use_aux_hidden_state = True + eagle_config = getattr(self.draft_model_config.hf_config, "eagle_config", None) + if eagle_config is not None: + use_aux_hidden_state = eagle_config.get("use_aux_hidden_state", True) + return use_aux_hidden_state + def validate_same_kv_cache_group(self, kv_cache_config: KVCacheConfig) -> None: """ Validate that all eagle layers belong to the same KVCacheGroup. @@ -1192,6 +1252,28 @@ class EagleProposer: == 1 ), "All eagle layers should belong to the same kv cache group" + def _pad_batch_across_dp( + self, + num_tokens_unpadded: int, + num_tokens_padded: int, + ) -> tuple[int, torch.Tensor]: + # TODO(Flechman): support DBO ubatching + ubatch_slices, num_toks_across_dp = coordinate_batch_across_dp( + num_tokens_unpadded=num_tokens_unpadded, + parallel_config=self.vllm_config.parallel_config, + allow_microbatching=False, + allow_dp_padding=self.use_cuda_graph, + num_tokens_padded=num_tokens_padded, + uniform_decode=None, + num_scheduled_tokens_per_request=None, + ) + assert ubatch_slices is None, "DBO ubatching not implemented for EAGLE" + + num_tokens_dp_padded = num_tokens_padded + if num_toks_across_dp is not None: + num_tokens_dp_padded = int(num_toks_across_dp[self.dp_rank].item()) + return num_tokens_dp_padded, num_toks_across_dp + # NOTE(woosuk): Currently, the below code is not used and we always use argmax # to sample the draft tokens. 
We will use this after we find a way to manage diff --git a/vllm/v1/worker/gpu/input_batch.py b/vllm/v1/worker/gpu/input_batch.py index 7675cb45170b5..2a7048ae3c0e0 100644 --- a/vllm/v1/worker/gpu/input_batch.py +++ b/vllm/v1/worker/gpu/input_batch.py @@ -4,7 +4,6 @@ from dataclasses import dataclass from typing import Any import numba -import numba.types as types import numpy as np import torch @@ -37,6 +36,9 @@ class InputBuffers: self.seq_lens = torch.zeros(max_num_reqs, dtype=torch.int32, device=device) self.cu_num_logits = self._make_buffer(max_num_reqs + 1, dtype=torch.int32) + # Spec decoding. + self.next_prefill_tokens = self._make_buffer(max_num_reqs, dtype=torch.int32) + # Structured outputs. self.bitmask_indices = self._make_buffer(max_num_reqs, dtype=torch.int32) self.grammar_bitmask = self._make_buffer( @@ -144,80 +146,42 @@ class InputBatch: ) -# NOTE: With the type annotations, this function is pre-compiled -# before the first call. -@numba.jit( - [ - types.none( - types.int32[:], # idx_mapping - types.int32[:], # num_scheduled_tokens - types.int32[:, :], # prefill_token_ids - types.int32[:], # num_computed_prefill_tokens - types.int32[:], # prefill_len - types.int32[:], # input_ids - types.int32[:], # query_start_loc - ) - ], - nopython=True, - cache=True, -) +@numba.njit(cache=True) def _prepare_prefill_inputs( - idx_mapping: np.ndarray, # batch_idx -> req_idx - num_scheduled_tokens: np.ndarray, # [B] + idx_mapping: np.ndarray, # [B] + query_lens: np.ndarray, # [B] + query_start_loc: np.ndarray, # [B + 1] prefill_token_ids: np.ndarray, # [N, max_model_len] num_computed_prefill_tokens: np.ndarray, # [N] - prefill_len: np.ndarray, # [N] input_ids: np.ndarray, # [num_input_tokens] - query_start_loc: np.ndarray, # [B + 1] ) -> None: - num_reqs = num_scheduled_tokens.shape[0] - query_start_loc[0] = 0 - - cu_num_tokens = 0 + num_reqs = idx_mapping.shape[0] + query_starts = query_start_loc[:num_reqs] + query_ends = query_start_loc[1 : num_reqs + 1] + 
starts = num_computed_prefill_tokens[idx_mapping] + ends = starts + query_lens for i in range(num_reqs): - req_idx = idx_mapping[i] - query_len = num_scheduled_tokens[i] - - start = num_computed_prefill_tokens[req_idx] - end = min(start + query_len, prefill_len[req_idx]) - n = end - start - - start_idx = cu_num_tokens - input_ids[start_idx : start_idx + n] = prefill_token_ids[req_idx, start:end] - - cu_num_tokens = start_idx + query_len - query_start_loc[i + 1] = cu_num_tokens - - # Pad the inputs for CUDA graphs. - # Note: pad query_start_loc to be non-decreasing, as kernels - # like FlashAttention requires that - query_start_loc[num_reqs + 1 :].fill(cu_num_tokens) + input_ids[query_starts[i] : query_ends[i]] = prefill_token_ids[ + idx_mapping[i], starts[i] : ends[i] + ] def prepare_prefill_inputs( idx_mapping: np.ndarray, num_scheduled_tokens: np.ndarray, - total_num_tokens: int, + query_start_loc: np.ndarray, prefill_token_ids: np.ndarray, num_computed_prefill_tokens: np.ndarray, - prefill_len: np.ndarray, - input_ids: CpuGpuBuffer, - query_start_loc: CpuGpuBuffer, + input_ids: np.ndarray, ) -> None: _prepare_prefill_inputs( idx_mapping, num_scheduled_tokens, + query_start_loc, prefill_token_ids, num_computed_prefill_tokens, - prefill_len, - input_ids.np, - query_start_loc.np, + input_ids, ) - input_ids.copy_to_gpu(total_num_tokens) - # NOTE(woosuk): We should copy the whole query_start_loc and seq_lens - # tensors from CPU to GPU, because they may include paddings needed - # for full CUDA graph mode. 
- query_start_loc.copy_to_gpu() @triton.jit @@ -380,8 +344,8 @@ def _post_update_kernel( sampled_tokens_ptr, sampled_tokens_stride, num_sampled_ptr, + num_rejected_ptr, query_start_loc_ptr, - cu_num_logits_ptr, ): req_id = tl.program_id(0) req_state_idx = tl.load(idx_mapping_ptr + req_id) @@ -396,17 +360,10 @@ def _post_update_kernel( query_start = tl.load(query_start_loc_ptr + req_id) query_end = tl.load(query_start_loc_ptr + req_id + 1) query_len = query_end - query_start + num_rejected = tl.load(num_rejected_ptr + req_id) num_computed = tl.load(num_computed_tokens_ptr + req_state_idx) - num_computed += query_len - # Consider the rejected tokens in spec decoding. - if num_sampled > 0: - # NOTE(woosuk): We must skip num_sampled == 0 to account for chunked prefills. - logits_start = tl.load(cu_num_logits_ptr + req_id) - logits_end = tl.load(cu_num_logits_ptr + req_id + 1) - num_logits = logits_end - logits_start - num_rejected = num_logits - num_sampled - num_computed -= num_rejected + num_computed += query_len - num_rejected tl.store(num_computed_tokens_ptr + req_state_idx, num_computed) @@ -421,10 +378,10 @@ def post_update( sampled_tokens: torch.Tensor, # [num_reqs] num_sampled: torch.Tensor, + # [num_reqs] + num_rejected: torch.Tensor, # [num_reqs + 1] query_start_loc: torch.Tensor, - # [num_reqs + 1] - cu_num_logits: torch.Tensor, ) -> None: num_reqs = idx_mapping.shape[0] _post_update_kernel[(num_reqs,)]( @@ -434,7 +391,7 @@ def post_update( sampled_tokens, sampled_tokens.stride(0), num_sampled, + num_rejected, query_start_loc, - cu_num_logits, num_warps=1, ) diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index 6e332ee4b75b8..e34a45f979807 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -45,7 +45,11 @@ from vllm.v1.worker.gpu.input_batch import ( prepare_prefill_inputs, ) from vllm.v1.worker.gpu.sampler import Sampler, compute_prompt_logprobs -from 
vllm.v1.worker.gpu.spec_decode.rejection_sample import rejection_sample +from vllm.v1.worker.gpu.spec_decode import init_speculator +from vllm.v1.worker.gpu.spec_decode.rejection_sample import ( + get_num_rejected, + rejection_sample, +) from vllm.v1.worker.gpu.states import RequestState, SamplingMetadata from vllm.v1.worker.gpu.structured_outputs import apply_grammar_bitmask from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin @@ -97,16 +101,20 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): if self.use_async_scheduling: self.input_prep_event = torch.cuda.Event() self.structured_outputs_event = torch.cuda.Event() + self.spec_decode_event = torch.cuda.Event() else: self.input_prep_event = None self.structured_outputs_event = None + self.spec_decode_event = None if self.speculative_config is not None: self.do_spec_decode = True self.num_speculative_steps = self.speculative_config.num_speculative_tokens + self.speculator = init_speculator(self.vllm_config, self.device) else: self.do_spec_decode = False self.num_speculative_steps = 0 + self.speculator = None self.req_states = RequestState( max_num_reqs=self.max_num_reqs, @@ -153,6 +161,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.vllm_config, self.device, ) + if self.do_spec_decode: + self.speculator.load_model(self.model) time_after_load = time.perf_counter() self.model_memory_usage = m.consumed_memory @@ -285,6 +295,35 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): logits = self.model.compute_logits(hidden_states) self.sampler(logits, sampling_metadata) + @torch.inference_mode() + def _dummy_speculator_run( + self, + hidden_states: torch.Tensor, + aux_hidden_states: list[torch.Tensor] | None, + ) -> None: + num_tokens = hidden_states.shape[0] + num_reqs = min(num_tokens, self.max_num_reqs) + input_batch = InputBatch.make_dummy( + num_reqs=num_reqs, + num_tokens=num_tokens, + 
input_buffers=self.input_buffers, + device=self.device, + ) + sampling_metadata = SamplingMetadata.make_dummy( + num_reqs=num_reqs, + device=self.device, + ) + num_sampled = torch.ones(num_reqs, dtype=torch.int32, device=self.device) + num_rejected = torch.zeros(num_reqs, dtype=torch.int32, device=self.device) + self.propose_draft( + input_batch=input_batch, + sampling_metadata=sampling_metadata, + last_hidden_states=hidden_states, + aux_hidden_states=aux_hidden_states, + num_sampled=num_sampled, + num_rejected=num_rejected, + ) + @torch.inference_mode() def profile_run(self) -> None: hidden_states, sample_hidden_states = self._dummy_run( @@ -292,6 +331,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): skip_attn=True, ) self._dummy_sampler_run(sample_hidden_states) + if self.do_spec_decode: + self._dummy_speculator_run(hidden_states, None) torch.cuda.synchronize() del hidden_states, sample_hidden_states gc.collect() @@ -466,20 +507,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Block tables: num_kv_cache_groups x [num_reqs, max_num_blocks] block_tables = self.block_tables.gather_block_tables(idx_mapping) - # Copy prefill tokens from CPU to GPU and get query_start_loc. + # Get query_start_loc. + np.cumsum( + num_scheduled_tokens, + out=self.input_buffers.query_start_loc.np[1 : num_reqs + 1], + ) + # Pad for full CUDA graph mode. + # Some attention backends like FA3 require query_start_loc to be non-decreasing. + self.input_buffers.query_start_loc.np[num_reqs + 1 :] = num_tokens + self.input_buffers.query_start_loc.copy_to_gpu() + query_start_loc_gpu = self.input_buffers.query_start_loc.gpu[: num_reqs + 1] + query_start_loc_np = self.input_buffers.query_start_loc.np[: num_reqs + 1] + + # Copy prefill tokens from CPU to GPU. 
prepare_prefill_inputs( idx_mapping_np, num_scheduled_tokens, - num_tokens, + query_start_loc_np, self.req_states.prefill_token_ids, self.req_states.num_computed_prefill_tokens, - self.req_states.prefill_len.np, - self.input_buffers.input_ids, - self.input_buffers.query_start_loc, + self.input_buffers.input_ids.np, ) - query_start_loc = self.input_buffers.query_start_loc - query_start_loc_gpu = query_start_loc.gpu[: num_reqs + 1] - query_start_loc_np = query_start_loc.np[: num_reqs + 1] + self.input_buffers.input_ids.copy_to_gpu(num_tokens) # Prepare positions and seq_lens. prepare_pos_seq_lens( @@ -562,7 +611,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): input_batch: InputBatch, sampling_metadata: SamplingMetadata, grammar_output: GrammarOutput | None, - ) -> tuple[SamplerOutput, torch.Tensor]: + ) -> tuple[SamplerOutput, torch.Tensor, torch.Tensor]: sample_hidden_states = hidden_states[input_batch.logits_indices] logits = self.model.compute_logits(sample_hidden_states) if grammar_output is not None: @@ -588,6 +637,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # No draft tokens (common case). # 0 if chunked-prefilling, 1 if not. num_sampled = (~is_chunked_prefilling).int() + num_rejected = torch.zeros_like(num_sampled) else: # Draft tokens for spec decoding. input_ids = input_batch.input_ids[input_batch.logits_indices] @@ -598,9 +648,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.num_speculative_steps, ) num_sampled *= ~is_chunked_prefilling + num_rejected = get_num_rejected( + input_batch.cu_num_logits, + num_sampled, + ) sampler_output.sampled_token_ids = sampled_tokens # TODO(woosuk): Support logprobs with spec decoding. 
- return sampler_output, num_sampled + return sampler_output, num_sampled, num_rejected def compute_prompt_logprobs( self, @@ -706,6 +760,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): input_batch: InputBatch, sampled_tokens: torch.Tensor, num_sampled: torch.Tensor, + num_rejected: torch.Tensor, ) -> None: # Update the number of computed tokens. post_update( @@ -714,8 +769,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.req_states.last_sampled_tokens, sampled_tokens, num_sampled, + num_rejected, input_batch.query_start_loc, - input_batch.cu_num_logits, ) # Update the number of computed prefill tokens. @@ -727,6 +782,43 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.req_states.prefill_len.np[idx_mapping_np], ) + @torch.inference_mode() + def propose_draft( + self, + input_batch: InputBatch, + sampling_metadata: SamplingMetadata, + last_hidden_states: torch.Tensor, + aux_hidden_states: list[torch.Tensor] | None, + num_sampled: torch.Tensor, + num_rejected: torch.Tensor, + ) -> torch.Tensor: + num_reqs = input_batch.num_reqs + idx_mapping_np = input_batch.idx_mapping_np + with async_barrier(self.spec_decode_event): + self.input_buffers.next_prefill_tokens.np[:num_reqs] = ( + self.req_states.prefill_token_ids[ + idx_mapping_np, + self.req_states.num_computed_prefill_tokens[idx_mapping_np], + ] + ) + next_prefill_tokens = self.input_buffers.next_prefill_tokens.copy_to_gpu( + num_reqs + ) + + assert self.speculator is not None + draft_tokens = self.speculator.propose( + input_batch, + sampling_metadata, + last_hidden_states, + aux_hidden_states, + num_sampled, + num_rejected, + self.req_states.last_sampled_tokens, + next_prefill_tokens, + ) + self.req_states.draft_tokens[input_batch.idx_mapping] = draft_tokens + return draft_tokens + def get_cudagraph_and_dp_padding( self, scheduler_output: SchedulerOutput, @@ -879,7 +971,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, 
KVConnectorModelRunnerMixin): self.execute_model_state = None # type: ignore assert sampling_metadata is not None - sampler_output, num_sampled_tokens = self.sample( + sampler_output, num_sampled, num_rejected = self.sample( hidden_states, input_batch, sampling_metadata, grammar_output ) prompt_logprobs_dict = self.compute_prompt_logprobs(hidden_states, input_batch) @@ -900,7 +992,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): async_output = AsyncOutput( model_runner_output=model_runner_output, sampler_output=sampler_output, - num_sampled_tokens=num_sampled_tokens, + num_sampled_tokens=num_sampled, copy_stream=self.output_copy_stream, copy_event=self.output_copy_event, ) @@ -911,8 +1003,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # This sequencing may slightly reduce latency as async D2H copy does not # need to wait for the postprocess to finish. self.postprocess( - input_batch, sampler_output.sampled_token_ids, num_sampled_tokens + input_batch, sampler_output.sampled_token_ids, num_sampled, num_rejected ) + if self.do_spec_decode: + _ = self.propose_draft( + input_batch, + sampling_metadata, + hidden_states, + None, # aux_hidden_states + num_sampled, + num_rejected, + ) if self.use_async_scheduling: return async_output diff --git a/vllm/v1/worker/gpu/sampler.py b/vllm/v1/worker/gpu/sampler.py index c48ed2d8ca167..d8676079ab951 100644 --- a/vllm/v1/worker/gpu/sampler.py +++ b/vllm/v1/worker/gpu/sampler.py @@ -100,8 +100,9 @@ def _gumbel_sample_kernel( mask=mask, other=float("-inf"), ) + logits = logits.to(tl.float32) - temp = tl.load(temp_ptr + req_idx) + temp = tl.load(temp_ptr + req_idx).to(tl.float32) if temp != 0.0: # Calculate the seed for gumbel noise. seed = tl.load(seeds_ptr + req_idx) @@ -116,7 +117,7 @@ def _gumbel_sample_kernel( # Apply temperature. if APPLY_TEMPERATURE: # NOTE(woosuk): Use div_rn to match the behavior of torch. 
- logits = tl.div_rn(logits, temp.to(tl.float32)) + logits = tl.div_rn(logits, temp) # Apply gumbel noise. logits = tl.where(mask, logits + gumbel_noise, float("-inf")) diff --git a/vllm/v1/worker/gpu/spec_decode/__init__.py b/vllm/v1/worker/gpu/spec_decode/__init__.py index e69de29bb2d1d..15b85204e05ce 100644 --- a/vllm/v1/worker/gpu/spec_decode/__init__.py +++ b/vllm/v1/worker/gpu/spec_decode/__init__.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch + +from vllm.config import VllmConfig + + +def init_speculator( + vllm_config: VllmConfig, + device: torch.device, +): + speculative_config = vllm_config.speculative_config + assert speculative_config is not None + if speculative_config.use_eagle(): + from vllm.v1.worker.gpu.spec_decode.eagle import EagleSpeculator + + return EagleSpeculator(vllm_config, device) + raise NotImplementedError(f"{speculative_config.method} is not supported yet.") diff --git a/vllm/v1/worker/gpu/spec_decode/eagle.py b/vllm/v1/worker/gpu/spec_decode/eagle.py new file mode 100644 index 0000000000000..3c8621cc69c97 --- /dev/null +++ b/vllm/v1/worker/gpu/spec_decode/eagle.py @@ -0,0 +1,209 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch +import torch.nn as nn + +from vllm.config import VllmConfig +from vllm.config.compilation import CUDAGraphMode +from vllm.forward_context import set_forward_context +from vllm.model_executor.model_loader import get_model +from vllm.triton_utils import tl, triton +from vllm.v1.worker.gpu.input_batch import InputBatch +from vllm.v1.worker.gpu.sampler import gumbel_sample +from vllm.v1.worker.gpu.states import SamplingMetadata + + +class EagleSpeculator: + def __init__(self, vllm_config: VllmConfig, device: torch.device): + self.vllm_config = vllm_config + self.device = device + + self.speculative_config = vllm_config.speculative_config + 
assert self.speculative_config is not None + self.method = self.speculative_config.method + self.num_speculative_steps = self.speculative_config.num_speculative_tokens + self.draft_model_config = self.speculative_config.draft_model_config + + self.scheduler_config = vllm_config.scheduler_config + self.max_num_reqs = self.scheduler_config.max_num_seqs + self.max_num_tokens = self.scheduler_config.max_num_batched_tokens + + self.input_ids = torch.zeros( + self.max_num_tokens, dtype=torch.int32, device=device + ) + self.positions = torch.zeros( + self.max_num_tokens, dtype=torch.int64, device=device + ) + + def load_model(self, target_model: nn.Module) -> None: + from vllm.compilation.backends import set_model_tag + + with set_model_tag("eagle_head"): + self.model = get_model( + vllm_config=self.vllm_config, model_config=self.draft_model_config + ) + + share_lm_head = True + if share_lm_head and hasattr(target_model, "lm_head"): + if hasattr(self.model, "lm_head"): + del self.model.lm_head + self.model.lm_head = target_model.lm_head + + @torch.inference_mode() + def propose( + self, + input_batch: InputBatch, + sampling_metadata: SamplingMetadata, + # [num_tokens, hidden_size] + last_hidden_states: torch.Tensor, + # num_layers x [num_tokens, hidden_size] + aux_hidden_states: list[torch.Tensor] | None, + # [num_reqs] + num_sampled: torch.Tensor, + # [num_reqs] + num_rejected: torch.Tensor, + # [max_num_reqs, 1] + last_sampled: torch.Tensor, + # [num_reqs] + next_prefill_tokens: torch.Tensor, + ) -> torch.Tensor: + # NOTE(woosuk): To avoid CPU-GPU synchronization without CPU knowing the + # number of rejected tokens, we maintain the size of eagle's input_ids and + # hidden_states the same as the target model's. This means, we pad each + # request's query length to include any rejected positions. By doing so, + # we can also reuse the attention metadata (e.g., query_start_loc, + # seq_lens) of the target model. 
+ if aux_hidden_states: + assert self.method == "eagle3" + hidden_states = self.model.combine_hidden_states( + torch.cat(aux_hidden_states, dim=-1) + ) + else: + hidden_states = last_hidden_states + + # Get the input ids and last token indices for the speculator. + last_token_indices = prepare_eagle_inputs( + self.input_ids, + input_batch, + num_sampled, + num_rejected, + last_sampled, + next_prefill_tokens, + ) + input_ids = self.input_ids[: input_batch.num_tokens_after_padding] + + # Prefill: Run the eagle speculator with eager mode. + with set_forward_context( + input_batch.attn_metadata, + self.vllm_config, + num_tokens=input_batch.num_tokens_after_padding, + cudagraph_runtime_mode=CUDAGraphMode.NONE, + ): + ret_hidden_states = self.model( + input_ids=input_ids, + positions=input_batch.positions, + hidden_states=hidden_states, + ) + if self.method == "mtp": + last_hidden_states = ret_hidden_states + hidden_states = ret_hidden_states + else: + last_hidden_states, hidden_states = ret_hidden_states + sample_hidden_states = last_hidden_states[last_token_indices] + logits = self.model.compute_logits(sample_hidden_states) + + num_reqs = input_batch.num_reqs + cu_num_logits = input_batch.cu_num_logits[:num_reqs] + temperature = sampling_metadata.temperature[cu_num_logits] + seed = sampling_metadata.seeds[cu_num_logits] + # NOTE(woosuk): We must add 1 to the positions to match the Gumbel noise + # used for draft and target sampling. + pos = input_batch.positions[last_token_indices] + 1 + # NOTE(woosuk): For draft sampling, we only consider the temperature + # and ignore the other sampling parameters such as top_k and top_p, + # for simplicity and performance. + # While this may slightly degrade the acceptance rate, it does not + # affect the output distribution after rejection sampling. + draft_tokens = gumbel_sample( + logits, temperature, seed, pos, apply_temperature=True + ) + if self.num_speculative_steps == 1: + # Early exit. 
+ return draft_tokens.view(-1, 1) + raise NotImplementedError("num_speculative_steps > 1 is not supported yet.") + + +@triton.jit +def _prepare_eagle_inputs_kernel( + last_token_indices_ptr, + eagle_input_ids_ptr, + target_input_ids_ptr, + idx_mapping_ptr, + last_sampled_ptr, + next_prefill_tokens_ptr, + num_sampled_ptr, + num_rejected_ptr, + query_start_loc_ptr, + BLOCK_SIZE: tl.constexpr, +): + batch_idx = tl.program_id(0) + query_start = tl.load(query_start_loc_ptr + batch_idx) + query_end = tl.load(query_start_loc_ptr + batch_idx + 1) + query_len = query_end - query_start + + # Get the true query length and next token after accounting for rejected tokens. + num_rejected = tl.load(num_rejected_ptr + batch_idx) + query_len -= num_rejected + + num_sampled = tl.load(num_sampled_ptr + batch_idx) + if num_sampled > 0: + req_state_idx = tl.load(idx_mapping_ptr + batch_idx) + next_token = tl.load(last_sampled_ptr + req_state_idx).to(tl.int32) + else: + # Chunked prefilling. + # Get the next prefill token. + next_token = tl.load(next_prefill_tokens_ptr + batch_idx) + + # Shift target_input_ids by one. 
+ for i in range(1, query_len, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + mask = block < query_len + input_ids = tl.load(target_input_ids_ptr + query_start + block, mask=mask) + tl.store(eagle_input_ids_ptr + query_start + block - 1, input_ids, mask=mask) + + last_token_index = query_start + query_len - 1 + tl.store(last_token_indices_ptr + batch_idx, last_token_index) + tl.store(eagle_input_ids_ptr + last_token_index, next_token) + + +def prepare_eagle_inputs( + eagle_input_ids: torch.Tensor, + input_batch: InputBatch, + # [num_reqs] + num_sampled: torch.Tensor, + # [num_reqs] + num_rejected: torch.Tensor, + # [max_num_reqs, 1] + last_sampled: torch.Tensor, + # [max_num_reqs] + next_prefill_tokens: torch.Tensor, +) -> torch.Tensor: + num_reqs = input_batch.num_reqs + last_token_indices = torch.empty( + num_reqs, + dtype=torch.int64, + device=eagle_input_ids.device, + ) + _prepare_eagle_inputs_kernel[(num_reqs,)]( + last_token_indices, + eagle_input_ids, + input_batch.input_ids, + input_batch.idx_mapping, + last_sampled, + next_prefill_tokens, + num_sampled, + num_rejected, + input_batch.query_start_loc, + BLOCK_SIZE=1024, + ) + return last_token_indices diff --git a/vllm/v1/worker/gpu/spec_decode/rejection_sample.py b/vllm/v1/worker/gpu/spec_decode/rejection_sample.py index 8a7bf28bacbd4..43c6ac518bccc 100644 --- a/vllm/v1/worker/gpu/spec_decode/rejection_sample.py +++ b/vllm/v1/worker/gpu/spec_decode/rejection_sample.py @@ -69,3 +69,15 @@ def rejection_sample( num_warps=1, ) return sampled, num_sampled + + +@torch.compile(dynamic=True) +def get_num_rejected( + cu_num_logits: torch.Tensor, + num_sampled: torch.Tensor, +) -> torch.Tensor: + num_logits = cu_num_logits[1:] - cu_num_logits[:-1] + num_rejected = num_logits - num_sampled + # No token is rejected for chunked prefills. 
+ num_rejected *= num_sampled > 0 + return num_rejected diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 4a2818ab1bfd8..e7991baeaa1b8 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -43,6 +43,8 @@ class CachedRequestState: mrope_positions: torch.Tensor | None = None mrope_position_delta: int | None = None + xdrope_positions: torch.Tensor | None = None + lora_request: LoRARequest | None = None prompt_embeds: torch.Tensor | None = None diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index cbafc9c993cc2..74fd2a1e2a2c0 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -50,16 +50,21 @@ from vllm.distributed.parallel_state import ( from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase -from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding +from vllm.model_executor.layers.rotary_embedding import ( + MRotaryEmbedding, + XDRotaryEmbedding, +) from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader from vllm.model_executor.models.interfaces import ( SupportsMRoPE, SupportsMultiModal, + SupportsXDRoPE, is_mixture_of_experts, supports_eagle3, supports_mrope, supports_multimodal_pruning, supports_transcription, + supports_xdrope, ) from vllm.model_executor.models.interfaces_base import ( VllmModelForPooling, @@ -324,6 +329,7 @@ class GPUModelRunner( # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope + self.uses_xdrope_dim = model_config.uses_xdrope_dim self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( model_config ) @@ -375,7 +381,9 @@ class GPUModelRunner( elif self.speculative_config.use_eagle(): self.drafter = EagleProposer(self.vllm_config, self.device, self) if 
self.speculative_config.method == "eagle3": - self.use_aux_hidden_state_outputs = True + self.use_aux_hidden_state_outputs = ( + self.drafter.eagle3_use_aux_hidden_state + ) elif self.speculative_config.method == "medusa": self.drafter = MedusaProposer( vllm_config=self.vllm_config, device=self.device @@ -510,6 +518,13 @@ class GPUModelRunner( (3, self.max_num_tokens + 1), dtype=torch.int64 ) + # Only relevant for models using XD-RoPE (e.g, HunYuan-VL) + if self.uses_xdrope_dim > 0: + # Similar to mrope but use assigned dimension number for RoPE, 4 as default. + self.xdrope_positions = self._make_buffer( + (self.uses_xdrope_dim, self.max_num_tokens + 1), dtype=torch.int64 + ) + # None in the first PP rank. The rest are set after load_model. self.intermediate_tensors: IntermediateTensors | None = None @@ -591,10 +606,14 @@ class GPUModelRunner( if isinstance(num_tokens, int): if self.uses_mrope: return self.mrope_positions.gpu[:, :num_tokens] + if self.uses_xdrope_dim > 0: + return self.xdrope_positions.gpu[:, :num_tokens] return self.positions.gpu[:num_tokens] else: if self.uses_mrope: return self.mrope_positions.gpu[:, num_tokens] + if self.uses_xdrope_dim > 0: + return self.xdrope_positions.gpu[:, num_tokens] return self.positions.gpu[num_tokens] def _make_buffer( @@ -770,6 +789,10 @@ class GPUModelRunner( if self.uses_mrope: self._init_mrope_positions(req_state) + # Only relevant for models using XD-RoPE (e.g, HunYuan-VL) + if self.uses_xdrope_dim > 0: + self._init_xdrope_positions(req_state) + reqs_to_add.append(req_state) # Update the states of the running/resumed requests. @@ -985,6 +1008,19 @@ class GPUModelRunner( ) ) + def _init_xdrope_positions(self, req_state: CachedRequestState): + model = self.get_model() + xdrope_model = cast(SupportsXDRoPE, model) + assert req_state.prompt_token_ids is not None, ( + "XD-RoPE requires prompt_token_ids to be available." + ) + assert supports_xdrope(model), "XD-RoPE support is not implemented." 
+ + req_state.xdrope_positions = xdrope_model.get_xdrope_input_positions( + req_state.prompt_token_ids, + req_state.mm_features, + ) + def _extract_mm_kwargs( self, scheduler_output: "SchedulerOutput", @@ -1229,6 +1265,11 @@ class GPUModelRunner( if self.uses_mrope: self._calc_mrope_positions(scheduler_output) + # Calculate XD-RoPE positions. + # Only relevant for models using XD-RoPE (e.g, HunYuan-VL) + if self.uses_xdrope_dim > 0: + self._calc_xdrope_positions(scheduler_output) + # Get token indices. # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2] @@ -1362,6 +1403,12 @@ class GPUModelRunner( self.mrope_positions.cpu[:, :total_num_scheduled_tokens], non_blocking=True, ) + elif self.uses_xdrope_dim > 0: + # Only relevant for models using XD-RoPE (e.g, HunYuan-VL) + self.xdrope_positions.gpu[:, :total_num_scheduled_tokens].copy_( + self.xdrope_positions.cpu[:, :total_num_scheduled_tokens], + non_blocking=True, + ) else: # Common case (1D positions) self.positions.copy_to_gpu(total_num_scheduled_tokens) @@ -1791,6 +1838,53 @@ class GPUModelRunner( mrope_pos_ptr += completion_part_len + def _calc_xdrope_positions(self, scheduler_output: "SchedulerOutput"): + xdrope_pos_ptr = 0 + for index, req_id in enumerate(self.input_batch.req_ids): + req = self.requests[req_id] + assert req.xdrope_positions is not None + + num_computed_tokens = self.input_batch.num_computed_tokens_cpu[index] + num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id] + num_prompt_tokens = length_from_prompt_token_ids_or_embeds( + req.prompt_token_ids, req.prompt_embeds + ) + + if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens: + prompt_part_len = max(0, num_prompt_tokens - num_computed_tokens) + completion_part_len = max(0, num_scheduled_tokens - prompt_part_len) + else: + prompt_part_len = num_scheduled_tokens + completion_part_len = 0 + + assert num_scheduled_tokens == prompt_part_len + completion_part_len 
+ + if prompt_part_len > 0: + # prompt's xdrope_positions are pre-computed + dst_start = xdrope_pos_ptr + dst_end = xdrope_pos_ptr + prompt_part_len + src_start = num_computed_tokens + src_end = num_computed_tokens + prompt_part_len + + self.xdrope_positions.cpu[:, dst_start:dst_end] = req.xdrope_positions[ + :, src_start:src_end + ] + xdrope_pos_ptr += prompt_part_len + + if completion_part_len > 0: + # compute completion's xdrope_positions on-the-fly + dst_start = xdrope_pos_ptr + dst_end = xdrope_pos_ptr + completion_part_len + + XDRotaryEmbedding.get_next_input_positions_tensor( + out=self.xdrope_positions.np, + out_offset=dst_start, + context_len=num_computed_tokens + prompt_part_len, + num_new_tokens=completion_part_len, + ) + + xdrope_pos_ptr += completion_part_len + def _calc_spec_decode_metadata( self, num_draft_tokens: np.ndarray, @@ -2035,6 +2129,7 @@ class GPUModelRunner( req_start_idx = 0 should_sync_mrope_positions = False + should_sync_xdrope_positions = False for req_id in self.input_batch.req_ids: mm_embeds_req: list[torch.Tensor] = [] @@ -2108,6 +2203,10 @@ class GPUModelRunner( self._calc_mrope_positions(scheduler_output) self.mrope_positions.copy_to_gpu(total_num_scheduled_tokens) + if should_sync_xdrope_positions: + self._calc_xdrope_positions(scheduler_output) + self.xdrope_positions.copy_to_gpu(total_num_scheduled_tokens) + return mm_embeds, is_mm_embed def get_model(self) -> nn.Module: @@ -2382,8 +2481,11 @@ class GPUModelRunner( input_ids = self.input_ids.gpu[:num_input_tokens] inputs_embeds = None model_kwargs = self._init_model_kwargs(num_input_tokens) + if self.uses_mrope: positions = self.mrope_positions.gpu[:, :num_input_tokens] + elif self.uses_xdrope_dim > 0: + positions = self.xdrope_positions.gpu[:, :num_input_tokens] else: positions = self.positions.gpu[:num_input_tokens] @@ -3644,6 +3746,7 @@ class GPUModelRunner( create_mixed_batch: bool = False, remove_lora: bool = True, activate_lora: bool = False, + is_graph_capturing: bool = 
False, ) -> tuple[torch.Tensor, torch.Tensor]: """ Run a dummy forward pass to warm up/profile run or capture the @@ -3822,6 +3925,8 @@ class GPUModelRunner( if self.uses_mrope: positions = self.mrope_positions.gpu[:, :num_tokens_after_padding] + elif self.uses_xdrope_dim > 0: + positions = self.xdrope_positions.gpu[:, :num_tokens_after_padding] else: positions = self.positions.gpu[:num_tokens_after_padding] @@ -3877,7 +3982,7 @@ class GPUModelRunner( if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) use_cudagraphs = ( - cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE + cudagraph_runtime_mode.has_mode(CUDAGraphMode.PIECEWISE) and not self.speculative_config.enforce_eager ) @@ -3891,6 +3996,7 @@ class GPUModelRunner( self.drafter.dummy_run( num_tokens, use_cudagraphs=use_cudagraphs, + is_graph_capturing=is_graph_capturing, ) # This is necessary to avoid blocking DP. @@ -4323,6 +4429,7 @@ class GPUModelRunner( skip_eplb=True, remove_lora=False, activate_lora=activate_lora, + is_graph_capturing=True, ) self.maybe_remove_all_loras(self.lora_config)