diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 03e2267a1b4ea..4cc9c70a6adbd 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -57,6 +57,7 @@ steps: agents: queue: tpu_queue_postmerge commands: + - "yes | docker system prune -a" - "git fetch --all" - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ." - "docker push vllm/vllm-tpu:nightly" diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d3c07cdda454d..84ee991f56599 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -293,6 +293,7 @@ steps: parallelism: 4 - label: PyTorch Compilation Unit Tests + torch_nightly: true source_file_dependencies: - vllm/ - tests/compile @@ -302,6 +303,7 @@ steps: - pytest -v -s compile/test_sequence_parallelism.py - label: PyTorch Fullgraph Smoke Test # 9min + torch_nightly: true source_file_dependencies: - vllm/ - tests/compile @@ -312,6 +314,7 @@ steps: - pytest -v -s compile/piecewise/test_toy_llama.py - label: PyTorch Fullgraph Test # 18min + torch_nightly: true source_file_dependencies: - vllm/ - tests/compile @@ -436,6 +439,7 @@ steps: ##### models test ##### - label: Basic Models Test # 24min + torch_nightly: true source_file_dependencies: - vllm/ - tests/models diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 90ed492d992a7..5ecd7b70ea540 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -46,7 +46,7 @@ repos: rev: 0.6.17 hooks: - id: pip-compile - args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match] + args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128] files: ^requirements/test\.(in|txt)$ - repo: local hooks: diff --git a/CMakeLists.txt b/CMakeLists.txt index 72740279d0e05..d530646cd78bd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,6 @@ project(vllm_extensions LANGUAGES CXX) # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py) set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM") - message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") message(STATUS "Target device: ${VLLM_TARGET_DEVICE}") @@ -250,9 +249,8 @@ set(VLLM_EXT_SRC if(VLLM_GPU_LANG STREQUAL "CUDA") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") - # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case. - # Please keep this in sync with FetchContent_Declare line below. - set(CUTLASS_REVISION "v3.9.0" CACHE STRING "CUTLASS revision to use") + # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. + set(CUTLASS_REVISION "v3.9.1" CACHE STRING "CUTLASS revision to use") # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) @@ -270,7 +268,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cutlass GIT_REPOSITORY https://github.com/nvidia/cutlass.git # Please keep this in sync with CUTLASS_REVISION line above. - GIT_TAG v3.9.0 + GIT_TAG ${CUTLASS_REVISION} GIT_PROGRESS TRUE # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history. @@ -682,6 +680,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() endif() +if(VLLM_GPU_LANG STREQUAL "CUDA") + set(MOE_PERMUTE_SRC + "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu" + "csrc/moe/moe_permute_unpermute_op.cu") + + set_gencode_flags_for_srcs( + SRCS "${MARLIN_PERMUTE_SRC}" + CUDA_ARCHS "${MOE_PERMUTE_ARCHS}") + + list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}") +endif() message(STATUS "Enabling moe extension.") define_gpu_extension_target( _moe_C @@ -690,6 +699,8 @@ define_gpu_extension_target( SOURCES ${VLLM_MOE_EXT_SRC} COMPILE_FLAGS ${VLLM_GPU_FLAGS} ARCHITECTURES ${VLLM_GPU_ARCHES} + INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR} + INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} USE_SABI 3 WITH_SOABI) diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index bcdbf6c7551a3..c92ea43e8260d 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -90,7 +90,8 @@ def bench_run(results: list[benchmark.Measurement], model: str, score = torch.randn((m, num_experts), device="cuda", dtype=dtype) - topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False) + topk_weights, topk_ids, token_expert_indices = fused_topk( + a, score, topk, renormalize=False) def run_triton_moe(a: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index a274537a67515..9407747f78435 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -115,8 +115,8 @@ def benchmark_config(config: BenchmarkConfig, from vllm.model_executor.layers.fused_moe import override_config with override_config(config): if use_deep_gemm: - topk_weights, topk_ids = fused_topk(x, input_gating, topk, - False) + topk_weights, topk_ids, token_expert_indices = fused_topk( + x, input_gating, topk, False) return fused_experts( x, w1, @@ -442,8 +442,14 @@ class BenchmarkWorker: hidden_size, search_space, is_fp16, topk) - with torch.cuda.device(self.device_id) if current_platform.is_rocm( - ) else nullcontext(): + need_device_guard = False + if current_platform.is_rocm(): + visible_device = os.environ.get("ROCR_VISIBLE_DEVICES", None) + if visible_device != f"{self.device_id}": + need_device_guard = True + + with torch.cuda.device( + self.device_id) if need_device_guard else nullcontext(): for config in tqdm(search_space): try: kernel_time = benchmark_config( @@ -578,6 +584,15 @@ def main(args: argparse.Namespace): use_deep_gemm = bool(args.use_deep_gemm) + if current_platform.is_rocm() and "HIP_VISIBLE_DEVICES" in os.environ: + # Ray will set ROCR_VISIBLE_DEVICES for device visibility + logger.warning( + "Ray uses ROCR_VISIBLE_DEVICES to control device accessibility." + "Replacing HIP_VISIBLE_DEVICES with ROCR_VISIBLE_DEVICES.") + val = os.environ["HIP_VISIBLE_DEVICES"] + os.environ["ROCR_VISIBLE_DEVICES"] = val + del os.environ["HIP_VISIBLE_DEVICES"] + ray.init() num_gpus = int(ray.available_resources()["GPU"]) workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)] diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py new file mode 100644 index 0000000000000..937df96246514 --- /dev/null +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -0,0 +1,349 @@ +# SPDX-License-Identifier: Apache-2.0 + +import argparse +from typing import Any, TypedDict + +import ray +import torch +from transformers import AutoConfig + +from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( + _moe_permute, _moe_unpermute_and_reduce) +from vllm.model_executor.layers.fused_moe.fused_moe import * +from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import * +from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser + +FP8_DTYPE = current_platform.fp8_dtype() + + +class BenchmarkConfig(TypedDict): + BLOCK_SIZE_M: int + BLOCK_SIZE_N: int + BLOCK_SIZE_K: int + GROUP_SIZE_M: int + num_warps: int + num_stages: int + + +def benchmark_permute(num_tokens: int, + num_experts: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + num_iters: int = 100, + use_customized_permute: bool = False) -> float: + # init_dtype = torch.float16 if use_fp8_w8a8 else dtype + hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype) + # output_hidden_states = torch.empty_like(hidden_states) + if use_fp8_w8a8: + align_block_size = 128 # deepgemm needs 128 m aligned block + qhidden_states, scale = _fp8_quantize(hidden_states, None, None) + else: + align_block_size = None + qhidden_states = hidden_states + + gating_output = torch.randn(num_iters, + num_tokens, + num_experts, + dtype=torch.float32) + + input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32) + topk_weights, topk_ids, token_expert_indices = fused_topk( + qhidden_states, input_gating, topk, False) + + def prepare(i: int): + input_gating.copy_(gating_output[i]) + + def run(): + if use_customized_permute: + (permuted_hidden_states, first_token_off, inv_perm_idx, + m_indices) = moe_permute( + qhidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + token_expert_indices=token_expert_indices, + topk=topk, + n_expert=num_experts, + n_local_expert=num_experts, + expert_map=None, + align_block_size=align_block_size, + ) + else: + (permuted_hidden_states, a1q_scale, sorted_token_ids, expert_ids, + inv_perm) = _moe_permute(qhidden_states, None, topk_ids, + num_experts, None, align_block_size) + + # JIT compilation & warmup + run() + torch.cuda.synchronize() + + # Capture 10 invocations with CUDA graph + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph): + for _ in range(10): + run() + torch.cuda.synchronize() + + # Warmup + for _ in range(5): + graph.replay() + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + latencies: list[float] = [] + for i in range(num_iters): + prepare(i) + torch.cuda.synchronize() + + start_event.record() + graph.replay() + end_event.record() + end_event.synchronize() + latencies.append(start_event.elapsed_time(end_event)) + avg = sum(latencies) / (num_iters * 10) * 1000 # us + graph.reset() + return avg + + +def benchmark_unpermute(num_tokens: int, + num_experts: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + num_iters: int = 100, + use_customized_permute: bool = False) -> float: + # init_dtype = torch.float16 if use_fp8_w8a8 else dtype + hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype) + output_hidden_states = torch.empty_like(hidden_states) + if use_fp8_w8a8: + align_block_size = 128 # deepgemm needs 128 m aligned block + qhidden_states, scale = _fp8_quantize(hidden_states, None, None) + else: + align_block_size = None + qhidden_states = hidden_states + + input_gating = torch.randn(num_tokens, num_experts, dtype=torch.float32) + + topk_weights, topk_ids, token_expert_indices = fused_topk( + qhidden_states, input_gating, topk, False) + + def prepare(): + if use_customized_permute: + (permuted_hidden_states, first_token_off, inv_perm_idx, + m_indices) = moe_permute( + qhidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + token_expert_indices=token_expert_indices, + topk=topk, + n_expert=num_experts, + n_local_expert=num_experts, + expert_map=None, + align_block_size=align_block_size, + ) + # convert to fp16/bf16 as gemm output + return (permuted_hidden_states.to(dtype), first_token_off, + inv_perm_idx, m_indices) + else: + (permuted_qhidden_states, a1q_scale, sorted_token_ids, expert_ids, + inv_perm) = _moe_permute(qhidden_states, None, topk_ids, + num_experts, None, align_block_size) + # convert to fp16/bf16 as gemm output + return (permuted_qhidden_states.to(dtype), a1q_scale, + sorted_token_ids, expert_ids, inv_perm) + + def run(input: tuple): + if use_customized_permute: + (permuted_hidden_states, first_token_off, inv_perm_idx, + m_indices) = input + moe_unpermute(permuted_hidden_states, topk_weights, topk_ids, + inv_perm_idx, first_token_off, topk, num_experts, + num_experts) + else: + (permuted_hidden_states, a1q_scale, sorted_token_ids, expert_ids, + inv_perm) = input + _moe_unpermute_and_reduce(output_hidden_states, + permuted_hidden_states, inv_perm, + topk_weights) + + # JIT compilation & warmup + input = prepare() + run(input) + torch.cuda.synchronize() + + # Capture 10 invocations with CUDA graph + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph): + for _ in range(10): + run(input) + torch.cuda.synchronize() + + # Warmup + for _ in range(5): + graph.replay() + torch.cuda.synchronize() + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + latencies: list[float] = [] + for i in range(num_iters): + torch.cuda.synchronize() + start_event.record() + graph.replay() + end_event.record() + end_event.synchronize() + latencies.append(start_event.elapsed_time(end_event)) + avg = sum(latencies) / (num_iters * 10) * 1000 # us + graph.reset() + return avg + + +@ray.remote(num_gpus=1) +class BenchmarkWorker: + + def __init__(self, seed: int) -> None: + torch.set_default_device("cuda") + current_platform.seed_everything(seed) + self.seed = seed + # Get the device ID to allocate tensors and kernels + # on the respective GPU. This is required for Ray to work + # correctly with multi-GPU tuning on the ROCm platform. + self.device_id = int(ray.get_gpu_ids()[0]) + + def benchmark( + self, + num_tokens: int, + num_experts: int, + hidden_size: int, + topk: int, + dtype: torch.dtype, + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + use_customized_permute: bool = False, + ) -> tuple[dict[str, int], float]: + current_platform.seed_everything(self.seed) + + permute_time = benchmark_permute( + num_tokens, + num_experts, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=100, + use_customized_permute=use_customized_permute) + unpermute_time = benchmark_unpermute( + num_tokens, + num_experts, + hidden_size, + topk, + dtype, + use_fp8_w8a8, + use_int8_w8a16, + num_iters=100, + use_customized_permute=use_customized_permute) + return permute_time, unpermute_time + + +def get_weight_block_size_safety(config, default_value=None): + + quantization_config = getattr(config, 'quantization_config', {}) + if isinstance(quantization_config, dict): + return quantization_config.get('weight_block_size', default_value) + return default_value + + +def main(args: argparse.Namespace): + print(args) + + config = AutoConfig.from_pretrained( + args.model, trust_remote_code=args.trust_remote_code) + if config.architectures[0] == "DbrxForCausalLM": + E = config.ffn_config.moe_num_experts + topk = config.ffn_config.moe_top_k + elif config.architectures[0] == "JambaForCausalLM": + E = config.num_experts + topk = config.num_experts_per_tok + elif (config.architectures[0] == "DeepseekV3ForCausalLM" + or config.architectures[0] == "DeepseekV2ForCausalLM"): + E = config.n_routed_experts + topk = config.num_experts_per_tok + elif config.architectures[0] in [ + "Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM" + ]: + E = config.num_experts + topk = config.num_experts_per_tok + + else: + # Support for llama4 + config = config.get_text_config() + # Default: Mixtral. + E = config.num_local_experts + topk = config.num_experts_per_tok + + hidden_size = config.hidden_size + dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype + use_fp8_w8a8 = args.dtype == "fp8_w8a8" + use_int8_w8a16 = args.dtype == "int8_w8a16" + use_customized_permute = args.use_customized_permute + + if args.batch_size is None: + batch_sizes = [ + 1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536, + 2048, 3072, 4096 + ] + else: + batch_sizes = [args.batch_size] + + ray.init() + num_gpus = int(ray.available_resources()["GPU"]) + workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)] + + def _distribute(method: str, inputs: list[Any]) -> list[Any]: + outputs = [] + worker_idx = 0 + for input_args in inputs: + worker = workers[worker_idx] + worker_method = getattr(worker, method) + output = worker_method.remote(*input_args) + outputs.append(output) + worker_idx = (worker_idx + 1) % num_gpus + return ray.get(outputs) + + outputs = _distribute( + "benchmark", [(batch_size, E, hidden_size, topk, dtype, use_fp8_w8a8, + use_int8_w8a16, use_customized_permute) + for batch_size in batch_sizes]) + + for batch_size, (permute, unpermute) in zip(batch_sizes, outputs): + print(f"Batch size: {batch_size}") + print(f"Permute time: {permute:.2f} us") + print(f"Unpermute time: {unpermute:.2f} us") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser() + parser.add_argument("--model", + type=str, + default="mistralai/Mixtral-8x7B-Instruct-v0.1") + parser.add_argument("--dtype", + type=str, + choices=["auto", "fp8_w8a8", "int8_w8a16"], + default="auto") + parser.add_argument("--use-customized-permute", action="store_true") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--batch-size", type=int, required=False) + parser.add_argument("--trust-remote-code", action="store_true") + args = parser.parse_args() + + main(args) diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu new file mode 100644 index 0000000000000..76d5f0eab0218 --- /dev/null +++ b/csrc/moe/moe_permute_unpermute_op.cu @@ -0,0 +1,133 @@ +#include +#include +#include +#include "permute_unpermute_kernels/moe_permute_unpermute_kernel.h" +#include "permute_unpermute_kernels/dispatch.h" +#include "core/registration.h" + +void moe_permute( + const torch::Tensor& input, // [n_token, hidden] + const torch::Tensor& topk_weights, //[n_token, topk] + torch::Tensor& topk_ids, // [n_token, topk] + const torch::Tensor& token_expert_indicies, // [n_token, topk] + const std::optional& expert_map, // [n_expert] + int64_t n_expert, int64_t n_local_expert, int64_t topk, + const std::optional& align_block_size, + torch::Tensor& + permuted_input, // [topk * n_token/align_block_size_m, hidden] + torch::Tensor& expert_first_token_offset, // [n_local_expert + 1] + torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk] + torch::Tensor& m_indices) { // [align_expand_m] + TORCH_CHECK(topk_weights.scalar_type() == at::ScalarType::Float, + "topk_weights must be float32"); + TORCH_CHECK(expert_first_token_offset.scalar_type() == at::ScalarType::Long, + "expert_first_token_offset must be int64"); + TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int, + "topk_ids must be int32"); + TORCH_CHECK(token_expert_indicies.scalar_type() == at::ScalarType::Int, + "token_expert_indicies must be int32"); + TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int, + "src_row_id2dst_row_id_map must be int32"); + TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1, + "expert_first_token_offset shape != n_local_expert+1") + TORCH_CHECK( + src_row_id2dst_row_id_map.sizes() == token_expert_indicies.sizes(), + "token_expert_indicies shape must be same as src_row_id2dst_row_id_map"); + auto n_token = input.sizes()[0]; + auto n_hidden = input.sizes()[1]; + auto align_block_size_value = + align_block_size.has_value() ? align_block_size.value() : -1; + auto stream = at::cuda::getCurrentCUDAStream().stream(); + const long sorter_size = + CubKeyValueSorter::getWorkspaceSize(n_token * topk, n_expert); + auto sort_workspace = torch::empty( + {sorter_size}, + torch::dtype(torch::kInt8).device(torch::kCUDA).requires_grad(false)); + auto permuted_experts_id = torch::empty_like(topk_ids); + auto dst_row_id2src_row_id_map = torch::empty_like(src_row_id2dst_row_id_map); + auto align_expert_first_token_offset = + torch::zeros_like(expert_first_token_offset); + + CubKeyValueSorter sorter{}; + int64_t* valid_num_ptr = nullptr; + // pre-process kernel for expert-parallelism: + // no local expert id plus "n_expert" offset for priority to local expert + // map local expert id [n, .., n+n_local_expert-1] to [0, n_local_expert -1] + // For example, 4 expert with ep_size=2. ep_rank=1 owns global expert id + // [2,3] with expert_map[-1, -1, 0, 1], preprocess_topk_id process topk_ids + // and map global expert id [2, 3] to local_expert id [0, 1] and map global + // expert id [0, 1] ( not in ep rank=1) to [4, 5] by plus n_expert. This map + // operation is to make local expert high priority in following sort topk_ids + // and scan local expert_first_token_offset for each ep rank for next group + // gemm. + if (expert_map.has_value()) { + const int* expert_map_ptr = get_ptr(expert_map.value()); + valid_num_ptr = + get_ptr(expert_first_token_offset) + n_local_expert; + preprocessTopkIdLauncher(get_ptr(topk_ids), n_token * topk, + expert_map_ptr, n_expert, stream); + } + // expert sort topk expert id and scan expert id get expert_first_token_offset + sortAndScanExpert(get_ptr(topk_ids), get_ptr(token_expert_indicies), + get_ptr(permuted_experts_id), + get_ptr(dst_row_id2src_row_id_map), + get_ptr(expert_first_token_offset), n_token, + n_expert, n_local_expert, topk, sorter, + get_ptr(sort_workspace), stream); + + // dispatch expandInputRowsKernelLauncher + MOE_DISPATCH(input.scalar_type(), [&] { + expandInputRowsKernelLauncher( + get_ptr(input), get_ptr(permuted_input), + get_ptr(topk_weights), get_ptr(permuted_experts_id), + get_ptr(dst_row_id2src_row_id_map), + get_ptr(src_row_id2dst_row_id_map), + get_ptr(expert_first_token_offset), n_token, valid_num_ptr, + n_hidden, topk, n_local_expert, align_block_size_value, stream); + }); + + // get m_indices and update expert_first_token_offset with align block + getMIndices(get_ptr(expert_first_token_offset), + get_ptr(align_expert_first_token_offset), + get_ptr(m_indices), n_local_expert, align_block_size_value, + stream); + if (align_block_size.has_value()) { + // update align_expert_first_token_offset + expert_first_token_offset.copy_(align_expert_first_token_offset); + } +} + +void moe_unpermute( + const torch::Tensor& permuted_hidden_states, // [n_token * topk, hidden] + const torch::Tensor& topk_weights, //[n_token, topk] + const torch::Tensor& topk_ids, // [n_token, topk] + const torch::Tensor& src_row_id2dst_row_id_map, // [n_token, topk] + const torch::Tensor& expert_first_token_offset, // [n_local_expert+1] + int64_t n_expert, int64_t n_local_expert, int64_t topk, + torch::Tensor& hidden_states // [n_token, hidden] +) { + TORCH_CHECK(src_row_id2dst_row_id_map.sizes() == topk_ids.sizes(), + "topk_ids shape must be same as src_row_id2dst_row_id_map"); + TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int, + "topk_ids must be int32"); + TORCH_CHECK( + permuted_hidden_states.scalar_type() == hidden_states.scalar_type(), + "topk_ids dtype must be same as src_row_id2dst_row_id_map"); + auto n_token = hidden_states.size(0); + auto n_hidden = hidden_states.size(1); + auto stream = at::cuda::getCurrentCUDAStream().stream(); + const int64_t* valid_ptr = + get_ptr(expert_first_token_offset) + n_local_expert; + MOE_DISPATCH(hidden_states.scalar_type(), [&] { + finalizeMoeRoutingKernelLauncher( + get_ptr(permuted_hidden_states), + get_ptr(hidden_states), get_ptr(topk_weights), + get_ptr(src_row_id2dst_row_id_map), get_ptr(topk_ids), + n_token, n_hidden, topk, valid_ptr, stream); + }); +} + +TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { + m.impl("moe_permute", &moe_permute); + m.impl("moe_unpermute", &moe_unpermute); +} \ No newline at end of file diff --git a/csrc/moe/permute_unpermute_kernels/dispatch.h b/csrc/moe/permute_unpermute_kernels/dispatch.h new file mode 100644 index 0000000000000..41932cdd85bcd --- /dev/null +++ b/csrc/moe/permute_unpermute_kernels/dispatch.h @@ -0,0 +1,53 @@ +#pragma once +#include +#define MOE_SWITCH(TYPE, ...) \ + at::ScalarType _st = ::detail::scalar_type(TYPE); \ + switch (_st) { \ + __VA_ARGS__ \ + default: \ + TORCH_CHECK(false, "[moe permute]data type dispatch fail!") \ + } + +#define MOE_DISPATCH_CASE(enum_type, ...) \ + case enum_type: { \ + using scalar_t = ScalarType2CudaType::type; \ + __VA_ARGS__(); \ + break; \ + } +#define MOE_DISPATCH_FLOAT_CASE(...) \ + MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) \ + MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__) + +#define MOE_DISPATCH(TYPE, ...) \ + MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__)) + +template +struct ScalarType2CudaType; + +template <> +struct ScalarType2CudaType { + using type = float; +}; +template <> +struct ScalarType2CudaType { + using type = half; +}; +template <> +struct ScalarType2CudaType { + using type = __nv_bfloat16; +}; + +// #if __CUDA_ARCH__ >= 890 +// fp8 +template <> +struct ScalarType2CudaType { + using type = __nv_fp8_e5m2; +}; +template <> +struct ScalarType2CudaType { + using type = __nv_fp8_e4m3; +}; +// #endif \ No newline at end of file diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu new file mode 100644 index 0000000000000..aa353d0f0437f --- /dev/null +++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu @@ -0,0 +1,229 @@ + +#include "moe_permute_unpermute_kernel.h" + +// CubKeyValueSorter definition begin +CubKeyValueSorter::CubKeyValueSorter() + : num_experts_(0), num_bits_(sizeof(int) * 8) {} + +int CubKeyValueSorter::expertsToBits(int num_experts) { + // Max value we represent is V = num_experts + (num_experts - 1) = 2 * + // num_experts - 1 The maximum number of bits is therefore floor(log2(V)) + 1 + return static_cast(log2(2 * num_experts - 1)) + 1; +} + +CubKeyValueSorter::CubKeyValueSorter(int const num_experts) + : num_experts_(num_experts), num_bits_(expertsToBits(num_experts)) {} + +void CubKeyValueSorter::updateNumExperts(int const num_experts) { + num_experts_ = num_experts; + num_bits_ = expertsToBits(num_experts); +} + +size_t CubKeyValueSorter::getWorkspaceSize(size_t const num_key_value_pairs, + int const num_experts) { + int num_bits = expertsToBits(num_experts); + size_t required_storage = 0; + int* null_int = nullptr; + cub::DeviceRadixSort::SortPairs(nullptr, required_storage, null_int, null_int, + null_int, null_int, num_key_value_pairs, 0, + num_bits); + + // when num_key_value_pairs, num_experts, num_bits, required_storage = 64, + // 4, 3, 0 The required_storage seems to vary between 0 and 1 for the same + // inputs + if (required_storage == 0) { + required_storage = 1; + } + return required_storage; +} + +void CubKeyValueSorter::run(void* workspace, size_t const workspace_size, + int const* keys_in, int* keys_out, + int const* values_in, int* values_out, + size_t const num_key_value_pairs, + cudaStream_t stream) { + size_t expected_ws_size = getWorkspaceSize(num_key_value_pairs, num_experts_); + size_t actual_ws_size = workspace_size; + + TORCH_CHECK(expected_ws_size <= workspace_size, + "[CubKeyValueSorter::run] The allocated workspace is too small " + "to run this problem."); + cub::DeviceRadixSort::SortPairs(workspace, actual_ws_size, keys_in, keys_out, + values_in, values_out, num_key_value_pairs, 0, + num_bits_, stream); +} +// CubKeyValueSorter definition end + +static inline size_t pad_to_multiple_of_16(size_t const& input) { + static constexpr int ALIGNMENT = 16; + return ALIGNMENT * ((input + ALIGNMENT - 1) / ALIGNMENT); +} +template +__device__ inline int64_t findTotalEltsLessThanTarget(T const* sorted_indices, + int64_t const arr_length, + T const target) { + int64_t low = 0, high = arr_length - 1, target_location = -1; + while (low <= high) { + int64_t mid = (low + high) / 2; + + if (sorted_indices[mid] >= target) { + high = mid - 1; + } else { + low = mid + 1; + target_location = mid; + } + } + return target_location + 1; +} + +// Calculates the start offset of the tokens for a given expert. The last +// element is the total number of valid tokens +__global__ void computeExpertFirstTokenOffsetKernel( + int const* sorted_experts, int64_t const sorted_experts_len, + int const num_experts, int64_t* expert_first_token_offset) { + // First, compute the global tid. We only need 1 thread per expert. + int const expert = blockIdx.x * blockDim.x + threadIdx.x; + + // Note that expert goes [0, num_experts] (inclusive) because we want a count + // for the total number of active tokens at the end of the scan. + if (expert >= num_experts + 1) { + return; + } + expert_first_token_offset[expert] = + findTotalEltsLessThanTarget(sorted_experts, sorted_experts_len, expert); +} + +void computeExpertFirstTokenOffset(int const* sorted_indices, + int const total_indices, + int const num_experts, + int64_t* expert_first_token_offset, + cudaStream_t stream) { + int const num_entries = num_experts + 1; + int const threads = std::min(1024, num_entries); + int const blocks = (num_entries + threads - 1) / threads; + + computeExpertFirstTokenOffsetKernel<<>>( + sorted_indices, total_indices, num_experts, expert_first_token_offset); +} + +void sortAndScanExpert(int* expert_for_source_row, const int* source_rows, + int* permuted_experts, int* permuted_rows, + int64_t* expert_first_token_offset, int num_rows, + int num_experts, int num_experts_per_node, int k, + CubKeyValueSorter& sorter, void* sorter_ws, + cudaStream_t stream) { + int64_t const expanded_num_rows = static_cast(k) * num_rows; + // We need to use the full num_experts because that is the sentinel value used + // by topk for disabled experts + sorter.updateNumExperts(num_experts); + size_t const sorter_ws_size_bytes = pad_to_multiple_of_16( + sorter.getWorkspaceSize(expanded_num_rows, num_experts)); + sorter.run((void*)sorter_ws, sorter_ws_size_bytes, expert_for_source_row, + permuted_experts, source_rows, permuted_rows, expanded_num_rows, + stream); + computeExpertFirstTokenOffset(permuted_experts, expanded_num_rows, + num_experts_per_node, expert_first_token_offset, + stream); +} + +__global__ void preprocessTopkIdKernel(int* topk_id_ptr, int size, + const int* expert_map_ptr, + int num_experts) { + auto tidx = threadIdx.x; + auto bidx = blockIdx.x; + auto lidx = tidx & 31; + auto widx = tidx >> 5; + auto warp_count = (blockDim.x + 31) >> 5; + auto offset = bidx * blockDim.x; + auto bound = min(offset + blockDim.x, size); + extern __shared__ int smem_expert_map[]; + // store expert_map in smem + for (int i = tidx; i < num_experts; i += blockDim.x) { + smem_expert_map[i] = expert_map_ptr[i]; + } + __syncthreads(); + + // query global expert id in expert map. + // if global expert id = -1 in exert map, plus n_expert + // else set global expert id = exert map[global expert id] + if (offset + tidx < bound) { + auto topk_id = topk_id_ptr[offset + tidx]; + auto local_expert_idx = smem_expert_map[topk_id]; + if (local_expert_idx == -1) { + topk_id += num_experts; + } else { + topk_id = local_expert_idx; + } + __syncwarp(); + topk_id_ptr[offset + tidx] = topk_id; + } +} +void preprocessTopkIdLauncher(int* topk_id_ptr, int size, + const int* expert_map_ptr, int num_experts, + cudaStream_t stream) { + int block = std::min(size, 1024); + int grid = (size + block - 1) / block; + int smem_size = (num_experts) * sizeof(int); + preprocessTopkIdKernel<<>>( + topk_id_ptr, size, expert_map_ptr, num_experts); +} + +template +__global__ void getMIndicesKernel(int64_t* expert_first_token_offset, + int64_t* align_expert_first_token_offset, + int* m_indices, const int num_local_expert, + const int align_block_size) { + int eidx = blockIdx.x; + int tidx = threadIdx.x; + extern __shared__ int64_t smem_expert_first_token_offset[]; + for (int i = tidx; i <= num_local_expert; i += blockDim.x) { + smem_expert_first_token_offset[tidx] = __ldg(expert_first_token_offset + i); + } + __syncthreads(); + auto last_token_offset = smem_expert_first_token_offset[eidx + 1]; + auto first_token_offset = smem_expert_first_token_offset[eidx]; + int n_token_in_expert = last_token_offset - first_token_offset; + + if constexpr (ALIGN_BLOCK_SIZE) { + n_token_in_expert = (n_token_in_expert + align_block_size - 1) / + align_block_size * align_block_size; + // round up to ALIGN_BLOCK_SIZE + int64_t accumulate_align_offset = 0; + for (int i = 1; i <= eidx + 1; i++) { + int n_token = smem_expert_first_token_offset[i] - + smem_expert_first_token_offset[i - 1]; + accumulate_align_offset = + accumulate_align_offset + (n_token + align_block_size - 1) / + align_block_size * align_block_size; + if (i == eidx) { + first_token_offset = accumulate_align_offset; + } + // last block store align_expert_first_token_offset + if (eidx == num_local_expert - 1 && threadIdx.x == 0) { + align_expert_first_token_offset[i] = accumulate_align_offset; + } + } + } + for (int idx = tidx; idx < n_token_in_expert; idx += blockDim.x) { + // update m_indice with expert id + m_indices[first_token_offset + idx] = eidx; + } +} + +void getMIndices(int64_t* expert_first_token_offset, + int64_t* align_expert_first_token_offset, int* m_indices, + int num_local_expert, const int align_block_size, + cudaStream_t stream) { + int block = 256; + int grid = num_local_expert; + int smem_size = sizeof(int64_t) * (num_local_expert + 1); + if (align_block_size == -1) { + getMIndicesKernel<<>>( + expert_first_token_offset, align_expert_first_token_offset, m_indices, + num_local_expert, align_block_size); + } else { + getMIndicesKernel<<>>( + expert_first_token_offset, align_expert_first_token_offset, m_indices, + num_local_expert, align_block_size); + } +} \ No newline at end of file diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h new file mode 100644 index 0000000000000..43c29721cd16e --- /dev/null +++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.h @@ -0,0 +1,95 @@ +#pragma once +// reference from tensorrt_llm moe kernel implementation archive in +// https://github.com/BBuf/tensorrt-llm-moe/tree/master + +#include +#include +#include "dispatch.h" +#include +#include +#include +#include "cutlass/numeric_size.h" +#include "cutlass/array.h" + +template +inline T* get_ptr(torch::Tensor& t) { + return reinterpret_cast(t.data_ptr()); +} + +template +inline const T* get_ptr(const torch::Tensor& t) { + return reinterpret_cast(t.data_ptr()); +} + +class CubKeyValueSorter { + public: + CubKeyValueSorter(); + + CubKeyValueSorter(int const num_experts); + + void updateNumExperts(int const num_experts); + + static size_t getWorkspaceSize(size_t const num_key_value_pairs, + int const num_experts); + + void run(void* workspace, size_t const workspace_size, int const* keys_in, + int* keys_out, int const* values_in, int* values_out, + size_t const num_key_value_pairs, cudaStream_t stream); + + private: + static int expertsToBits(int experts); + int num_experts_; + int num_bits_; +}; + +void computeExpertFirstTokenOffset(int const* sorted_indices, + int const total_indices, + int const num_experts, + int64_t* expert_first_token_offset, + cudaStream_t stream); + +void sortAndScanExpert(int* expert_for_source_row, const int* source_rows, + int* permuted_experts, int* permuted_rows, + int64_t* expert_first_token_offset, int num_rows, + int num_experts, int num_experts_per_node, int k, + CubKeyValueSorter& sorter, void* sorter_ws, + cudaStream_t stream); + +template +void expandInputRowsKernelLauncher( + T const* unpermuted_input, T* permuted_output, + const float* unpermuted_scales, int* sorted_experts, + int const* expanded_dest_row_to_expanded_source_row, + int* expanded_source_row_to_expanded_dest_row, + int64_t* expert_first_token_offset, int64_t const num_rows, + int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k, + int num_local_experts, const int& align_block_size, cudaStream_t stream); + +// Final kernel to unpermute and scale +// This kernel unpermutes the original data, does the k-way reduction and +// performs the final skip connection. +template +__global__ void finalizeMoeRoutingKernel( + T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, + float const* scales, int const* expanded_source_row_to_expanded_dest_row, + int const* expert_for_source_row, int64_t const orig_cols, int64_t const k, + int64_t const* num_valid_ptr); + +template +void finalizeMoeRoutingKernelLauncher( + T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, + float const* scales, int const* expanded_source_row_to_expanded_dest_row, + int const* expert_for_source_row, int64_t const num_rows, + int64_t const cols, int64_t const k, int64_t const* num_valid_ptr, + cudaStream_t stream); + +void preprocessTopkIdLauncher(int* topk_id_ptr, int size, + const int* expert_map_ptr, int num_experts, + cudaStream_t stream); + +void getMIndices(int64_t* expert_first_token_offset, + int64_t* align_expert_first_token_offset, int* m_indices, + int num_local_expert, const int align_block_size, + cudaStream_t stream); + +#include "moe_permute_unpermute_kernel.inl" diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl new file mode 100644 index 0000000000000..42441800fb110 --- /dev/null +++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.inl @@ -0,0 +1,211 @@ +#pragma once + +template +__global__ void expandInputRowsKernel( + T const* unpermuted_input, T* permuted_output, + const float* unpermuted_scales, int* sorted_experts, + int const* expanded_dest_row_to_expanded_source_row, + int* expanded_source_row_to_expanded_dest_row, + int64_t* expert_first_token_offset, int64_t const num_rows, + int64_t const* num_dest_rows, int64_t const cols, int64_t k, + int num_local_experts, int align_block_size) { + // Reverse permutation map. + // I do this so that later, we can use the source -> dest map to do the k-way + // reduction and unpermuting. I need the reverse map for that reduction to + // allow each threadblock to do 1 k-way reduce without atomics later in MoE. 1 + // thread block will be responsible for all k summations. + int64_t expanded_dest_row = blockIdx.x; + int64_t const expanded_source_row = + expanded_dest_row_to_expanded_source_row[expanded_dest_row]; + int expert_id = sorted_experts[expanded_dest_row]; + + extern __shared__ int64_t smem_expert_first_token_offset[]; + int64_t align_expanded_row_accumulate = 0; + if constexpr (ALIGN_BLOCK_SIZE) { + // load g2s + for (int idx = threadIdx.x; idx < num_local_experts + 1; + idx += blockDim.x) { + smem_expert_first_token_offset[idx] = + __ldg(expert_first_token_offset + idx); + } + __syncthreads(); + int lane_idx = threadIdx.x & 31; + + if (lane_idx == 0) { + // set token_offset_in_expert = 0 if this expert is not local expert + int token_offset_in_expert = + expert_id >= num_local_experts + ? 0 + : expanded_dest_row - smem_expert_first_token_offset[expert_id]; + int64_t accumulate_align_offset = 0; +#pragma unroll 1 + for (int eidx = 1; eidx <= min(expert_id, num_local_experts); eidx++) { + auto n_token_in_expert = smem_expert_first_token_offset[eidx] - + smem_expert_first_token_offset[eidx - 1]; + accumulate_align_offset += (n_token_in_expert + align_block_size - 1) / + align_block_size * align_block_size; + } + expanded_dest_row = accumulate_align_offset + token_offset_in_expert; + } + // lane0 shuffle broadcast align_expanded_dest_row + expanded_dest_row = __shfl_sync(0xffffffff, expanded_dest_row, 0); + } + + if (threadIdx.x == 0) { + assert(expanded_dest_row <= INT32_MAX); + expanded_source_row_to_expanded_dest_row[expanded_source_row] = + static_cast(expanded_dest_row); + } + + if (!CHECK_SKIPPED || blockIdx.x < *num_dest_rows) { + // Load 128-bits per thread + constexpr int64_t ELEM_PER_THREAD = 128 / cutlass::sizeof_bits::value; + using DataElem = cutlass::Array; + + // Duplicate and permute rows + int64_t const source_k_rank = expanded_source_row / num_rows; + int64_t const source_row = expanded_source_row % num_rows; + + auto const* source_row_ptr = + reinterpret_cast(unpermuted_input + source_row * cols); + auto* dest_row_ptr = + reinterpret_cast(permuted_output + expanded_dest_row * cols); + + int64_t const start_offset = threadIdx.x; + int64_t const stride = blockDim.x; + int64_t const num_elems_in_col = cols / ELEM_PER_THREAD; + + for (int elem_index = start_offset; elem_index < num_elems_in_col; + elem_index += stride) { + dest_row_ptr[elem_index] = source_row_ptr[elem_index]; + } + } +} + +template +void expandInputRowsKernelLauncher( + T const* unpermuted_input, T* permuted_output, + const float* unpermuted_scales, int* sorted_experts, + int const* expanded_dest_row_to_expanded_source_row, + int* expanded_source_row_to_expanded_dest_row, + int64_t* expert_first_token_offset, int64_t const num_rows, + int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k, + int num_local_experts, const int& align_block_size, cudaStream_t stream) { + int64_t const blocks = num_rows * k; + int64_t const threads = 256; + using FuncPtr = decltype(&expandInputRowsKernel); + FuncPtr func_map[2][2] = { + {&expandInputRowsKernel, + &expandInputRowsKernel}, + {&expandInputRowsKernel, + &expandInputRowsKernel}, + }; + bool is_check_skip = num_valid_tokens_ptr != nullptr; + bool is_align_block_size = align_block_size != -1; + auto func = func_map[is_check_skip][is_align_block_size]; + + int64_t smem_size = sizeof(int64_t) * (num_local_experts + 1); + + func<<>>( + unpermuted_input, permuted_output, unpermuted_scales, sorted_experts, + expanded_dest_row_to_expanded_source_row, + expanded_source_row_to_expanded_dest_row, expert_first_token_offset, + num_rows, num_valid_tokens_ptr, cols, k, num_local_experts, + align_block_size); +} + +template +__host__ __device__ constexpr static U arrayConvert(T const& input) { + using Type = typename U::Element; + static_assert(T::kElements == U::kElements); + U u; +#pragma unroll + for (int i = 0; i < U::kElements; i++) { + u[i] = static_cast(input[i]); + } + return u; +} + +template +__global__ void finalizeMoeRoutingKernel( + T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, + float const* scales, int const* expanded_source_row_to_expanded_dest_row, + int const* expert_for_source_row, int64_t const orig_cols, int64_t const k, + int64_t const* num_valid_ptr) { + assert(orig_cols % 4 == 0); + int64_t const original_row = blockIdx.x; + int64_t const num_rows = gridDim.x; + auto const offset = original_row * orig_cols; + OutputType* reduced_row_ptr = reduced_unpermuted_output + offset; + int64_t const num_valid = *num_valid_ptr; + + // Load 128-bits per thread, according to the smallest data type we read/write + constexpr int64_t FINALIZE_ELEM_PER_THREAD = + 128 / std::min(cutlass::sizeof_bits::value, + cutlass::sizeof_bits::value); + + int64_t const start_offset = threadIdx.x; + int64_t const stride = blockDim.x; + int64_t const num_elems_in_col = orig_cols / FINALIZE_ELEM_PER_THREAD; + + using InputElem = cutlass::Array; + using OutputElem = cutlass::Array; + using ComputeElem = cutlass::Array; + auto const* expanded_permuted_rows_v = + reinterpret_cast(expanded_permuted_rows); + auto* reduced_row_ptr_v = reinterpret_cast(reduced_row_ptr); + +#pragma unroll + for (int elem_index = start_offset; elem_index < num_elems_in_col; + elem_index += stride) { + ComputeElem thread_output; + thread_output.fill(0); + float row_rescale{0.f}; + for (int k_idx = 0; k_idx < k; ++k_idx) { + int64_t const expanded_original_row = original_row + k_idx * num_rows; + int64_t const expanded_permuted_row = + expanded_source_row_to_expanded_dest_row[expanded_original_row]; + + int64_t const k_offset = original_row * k + k_idx; + float const row_scale = scales[k_offset]; + + // Check after row_rescale has accumulated + if (CHECK_SKIPPED && expanded_permuted_row >= num_valid) { + continue; + } + + auto const* expanded_permuted_rows_row_ptr = + expanded_permuted_rows_v + expanded_permuted_row * num_elems_in_col; + + int64_t const expert_idx = expert_for_source_row[k_offset]; + + ComputeElem expert_result = arrayConvert( + expanded_permuted_rows_row_ptr[elem_index]); + thread_output = thread_output + row_scale * (expert_result); + } + + OutputElem output_elem = + arrayConvert(thread_output); + reduced_row_ptr_v[elem_index] = output_elem; + } +} + +template +void finalizeMoeRoutingKernelLauncher( + T const* expanded_permuted_rows, OutputType* reduced_unpermuted_output, + float const* scales, int const* expanded_source_row_to_expanded_dest_row, + int const* expert_for_source_row, int64_t const num_rows, + int64_t const cols, int64_t const k, int64_t const* num_valid_ptr, + cudaStream_t stream) { + int64_t const blocks = num_rows; + int64_t const threads = 256; + bool const check_finished = num_valid_ptr != nullptr; + using FuncPtr = decltype(&finalizeMoeRoutingKernel); + FuncPtr func_map[2] = {&finalizeMoeRoutingKernel, + &finalizeMoeRoutingKernel}; + auto* const kernel = func_map[check_finished]; + kernel<<>>( + expanded_permuted_rows, reduced_unpermuted_output, scales, + expanded_source_row_to_expanded_dest_row, expert_for_source_row, cols, k, + num_valid_ptr); +} diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index d0de42251f97a..2a8b9bb39caa9 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -53,7 +53,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { "int size_m, int size_n, int size_k," "bool is_full_k, bool use_atomic_add," "bool use_fp32_reduce, bool is_zp_float) -> Tensor"); + m.def( + "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " + "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! " + "b_zeros, Tensor! g_idx, Tensor! perm, Tensor! workspace, " + "int b_q_type, SymInt size_m, " + "SymInt size_n, SymInt size_k, bool is_k_full, int num_experts, int " + "topk, " + "int moe_block_size, bool replicate_input, bool apply_weights)" + " -> Tensor"); + m.def( + "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids," + "Tensor token_expert_indicies, Tensor? expert_map, int n_expert," + "int n_local_expert," + "int topk, int? align_block_size,Tensor! permuted_input, Tensor! " + "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! " + "m_indices)->()"); + + m.def( + "moe_unpermute(Tensor permuted_hidden_states, Tensor topk_weights," + "Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor " + "expert_first_token_offset, int n_expert, int n_local_expert,int " + "topk, Tensor! hidden_states)->()"); // conditionally compiled so impl registration is in source file #endif diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index 7c4016cae1d34..89b31f0311e23 100644 --- a/docs/source/contributing/overview.md +++ b/docs/source/contributing/overview.md @@ -58,6 +58,12 @@ Therefore, we recommend developing with Python 3.12 to minimise the chance of yo Currently, the repository is not fully checked by `mypy`. ::: +:::{note} +Currently, not all unit tests pass when run on CPU platforms. If you don't have access to a GPU +platform to run unit tests locally, rely on the continuous integration system to run the tests for +now. +::: + ## Issues If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. diff --git a/docs/source/features/quantization/fp8.md b/docs/source/features/quantization/fp8.md index b90bb49ef87ea..95e105357bd3c 100644 --- a/docs/source/features/quantization/fp8.md +++ b/docs/source/features/quantization/fp8.md @@ -30,6 +30,7 @@ from vllm import LLM model = LLM("facebook/opt-125m", quantization="fp8") # INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB result = model.generate("Hello, my name is") +print(result[0].outputs[0].text) ``` :::{warning} @@ -105,7 +106,8 @@ Load and run the model in `vllm`: ```python from vllm import LLM model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic") -model.generate("Hello my name is") +result = model.generate("Hello my name is") +print(result[0].outputs[0].text) ``` Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`): @@ -188,4 +190,5 @@ from vllm import LLM model = LLM(model="Meta-Llama-3-8B-Instruct-FP8/") # INFO 06-10 21:15:41 model_runner.py:159] Loading model weights took 8.4596 GB result = model.generate("Hello, my name is") +print(result[0].outputs[0].text) ``` diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md index c7c8aeb662a56..7ad46b7094ee9 100644 --- a/docs/source/features/quantization/index.md +++ b/docs/source/features/quantization/index.md @@ -17,6 +17,7 @@ gptqmodel int4 int8 fp8 +modelopt quark quantized_kvcache torchao diff --git a/docs/source/features/quantization/modelopt.md b/docs/source/features/quantization/modelopt.md new file mode 100644 index 0000000000000..001d18657dad0 --- /dev/null +++ b/docs/source/features/quantization/modelopt.md @@ -0,0 +1,78 @@ +# NVIDIA TensorRT Model Optimizer + +The [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is a library designed to optimize models for inference with NVIDIA GPUs. It includes tools for Post-Training Quantization (PTQ) and Quantization Aware Training (QAT) of Large Language Models (LLMs), Vision Language Models (VLMs), and diffusion models. + +We recommend installing the library with: + +```console +pip install nvidia-modelopt +``` + +## Quantizing HuggingFace Models with PTQ + +You can quantize HuggingFace models using the example scripts provided in the TensorRT Model Optimizer repository. The primary script for LLM PTQ is typically found within the `examples/llm_ptq` directory. + +Below is an example showing how to quantize a model using modelopt's PTQ API: + +```python +import modelopt.torch.quantization as mtq +from transformers import AutoModelForCausalLM + +# Load the model from HuggingFace +model = AutoModelForCausalLM.from_pretrained("") + +# Select the quantization config, for example, FP8 +config = mtq.FP8_DEFAULT_CFG + +# Define a forward loop function for calibration +def forward_loop(model): + for data in calib_set: + model(data) + +# PTQ with in-place replacement of quantized modules +model = mtq.quantize(model, config, forward_loop) +``` + +After the model is quantized, you can export it to a quantized checkpoint using the export API: + +```python +import torch +from modelopt.torch.export import export_hf_checkpoint + +with torch.inference_mode(): + export_hf_checkpoint( + model, # The quantized model. + export_dir, # The directory where the exported files will be stored. + ) +``` + +The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM: + +```python +from vllm import LLM, SamplingParams + +def main(): + + model_id = "nvidia/Llama-3.1-8B-Instruct-FP8" + # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint + llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True) + + sampling_params = SamplingParams(temperature=0.8, top_p=0.9) + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + outputs = llm.generate(prompts, sampling_params) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +if __name__ == "__main__": + main() +``` diff --git a/docs/source/features/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md index 08893f0e95959..f8af1ba60b125 100644 --- a/docs/source/features/quantization/supported_hardware.md +++ b/docs/source/features/quantization/supported_hardware.md @@ -129,7 +129,17 @@ The table below shows the compatibility of various quantization implementations * ❌ * ❌ * ❌ - +- * modelopt + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎︎ + * ❌ + * ❌ + * ❌ + * ❌ + * ❌ ::: - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py index c75a990120e07..c2c28d5ae6ae5 100644 --- a/examples/offline_inference/qwen2_5_omni/only_thinker.py +++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py @@ -47,8 +47,7 @@ def get_mixed_modalities_query() -> QueryResult: "image": ImageAsset("cherry_blossom").pil_image.convert("RGB"), "video": - VideoAsset(name="sample_demo_1.mp4", - num_frames=16).np_ndarrays, + VideoAsset(name="baby_reading", num_frames=16).np_ndarrays, }, }, limit_mm_per_prompt={ @@ -66,7 +65,7 @@ def get_use_audio_in_video_query() -> QueryResult: "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>" f"{question}<|im_end|>\n" f"<|im_start|>assistant\n") - asset = VideoAsset(name="sample_demo_1.mp4", num_frames=16) + asset = VideoAsset(name="baby_reading", num_frames=16) audio = asset.get_audio(sampling_rate=16000) assert not envs.VLLM_USE_V1, ("V1 does not support use_audio_in_video. " "Please launch this example with " diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 755e19bb26998..aca11f5c50ba3 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1109,7 +1109,7 @@ def get_multi_modal_input(args): if args.modality == "video": # Input video and question - video = VideoAsset(name="sample_demo_1.mp4", + video = VideoAsset(name="baby_reading", num_frames=args.num_frames).np_ndarrays vid_questions = ["Why is this video funny?"] diff --git a/requirements/neuron.txt b/requirements/neuron.txt index 5f25bd0546e69..f8e3030834e25 100644 --- a/requirements/neuron.txt +++ b/requirements/neuron.txt @@ -2,5 +2,7 @@ -r common.txt # Dependencies for Neuron devices +packaging>=24.2 +setuptools>=77.0.3,<80.0.0 torch-neuronx >= 2.5.0 neuronx-cc diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index 199bcafe0bdd0..e2711354ac10e 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -23,5 +23,11 @@ runai-model-streamer-s3==0.11.0 tensorizer>=2.9.0 lm-eval==0.4.8 buildkite-test-collector==0.1.9 - lm-eval[api]==0.4.8 # required for model evaluation test + +# required for quantization test +bitsandbytes>=0.45.3 + +# required for minicpmo_26 test +vector_quantize_pytorch +vocos diff --git a/requirements/test.txt b/requirements/test.txt index d4c92f15025fe..9a15d9a0d8240 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match +# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu128 absl-py==2.1.0 # via rouge-score accelerate==1.0.1 @@ -349,28 +349,28 @@ numpy==1.26.4 # transformers # tritonclient # vocos -nvidia-cublas-cu12==12.6.4.1 +nvidia-cublas-cu12==12.8.3.14 # via # nvidia-cudnn-cu12 # nvidia-cusolver-cu12 # torch -nvidia-cuda-cupti-cu12==12.6.80 +nvidia-cuda-cupti-cu12==12.8.57 # via torch -nvidia-cuda-nvrtc-cu12==12.6.77 +nvidia-cuda-nvrtc-cu12==12.8.61 # via torch -nvidia-cuda-runtime-cu12==12.6.77 +nvidia-cuda-runtime-cu12==12.8.57 # via torch -nvidia-cudnn-cu12==9.5.1.17 +nvidia-cudnn-cu12==9.7.1.26 # via torch -nvidia-cufft-cu12==11.3.0.4 +nvidia-cufft-cu12==11.3.3.41 # via torch -nvidia-cufile-cu12==1.11.1.6 +nvidia-cufile-cu12==1.13.0.11 # via torch -nvidia-curand-cu12==10.3.7.77 +nvidia-curand-cu12==10.3.9.55 # via torch -nvidia-cusolver-cu12==11.7.1.2 +nvidia-cusolver-cu12==11.7.2.55 # via torch -nvidia-cusparse-cu12==12.5.4.2 +nvidia-cusparse-cu12==12.5.7.53 # via # nvidia-cusolver-cu12 # torch @@ -378,13 +378,13 @@ nvidia-cusparselt-cu12==0.6.3 # via torch nvidia-nccl-cu12==2.26.2 # via torch -nvidia-nvjitlink-cu12==12.6.85 +nvidia-nvjitlink-cu12==12.8.61 # via # nvidia-cufft-cu12 # nvidia-cusolver-cu12 # nvidia-cusparse-cu12 # torch -nvidia-nvtx-cu12==12.6.77 +nvidia-nvtx-cu12==12.8.55 # via torch opencv-python-headless==4.11.0.86 # via @@ -687,7 +687,7 @@ tomli==2.2.1 # via schemathesis tomli-w==1.2.0 # via schemathesis -torch==2.7.0 +torch==2.7.0+cu128 # via # -r requirements/test.in # accelerate @@ -705,12 +705,12 @@ torch==2.7.0 # torchvision # vector-quantize-pytorch # vocos -torchaudio==2.7.0 +torchaudio==2.7.0+cu128 # via # -r requirements/test.in # encodec # vocos -torchvision==0.22.0 +torchvision==0.22.0+cu128 # via # -r requirements/test.in # timm diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 0b76779b3a752..b6b45d1cbe880 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -103,7 +103,8 @@ def test_compile_correctness( method = test_setting.method fullgraph = test_setting.fullgraph if cuda_device_count_stateless() != pp_size * tp_size: - pytest.skip("Not correct CUDA devices for the test.") + pytest.skip(f"Need exactly {pp_size}*{tp_size} CUDA gpus but got " + f"{cuda_device_count_stateless()}") with monkeypatch.context() as m: m.setenv("VLLM_ATTENTION_BACKEND", attn_backend) diff --git a/tests/conftest.py b/tests/conftest.py index f02b5a8c0520e..b1b4af86fab72 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,9 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 - import json import os import tempfile -from collections import UserList from enum import Enum from typing import Any, Callable, Optional, TypedDict, TypeVar, Union @@ -58,16 +56,12 @@ def _read_prompts(filename: str) -> list[str]: return prompts -class _ImageAssetPrompts(TypedDict): +class ImageAssetPrompts(TypedDict): stop_sign: str cherry_blossom: str -class _ImageAssetsBase(UserList[ImageAsset]): - pass - - -class _ImageAssets(_ImageAssetsBase): +class ImageTestAssets(list[ImageAsset]): def __init__(self) -> None: super().__init__([ @@ -75,7 +69,7 @@ class _ImageAssets(_ImageAssetsBase): ImageAsset("cherry_blossom"), ]) - def prompts(self, prompts: _ImageAssetPrompts) -> list[str]: + def prompts(self, prompts: ImageAssetPrompts) -> list[str]: """ Convenience method to define the prompt for each test image. @@ -85,30 +79,27 @@ class _ImageAssets(_ImageAssetsBase): return [prompts["stop_sign"], prompts["cherry_blossom"]] -class _VideoAssetPrompts(TypedDict): - sample_demo_1: str +class VideoAssetPrompts(TypedDict): + baby_reading: str -class _VideoAssetsBase(UserList[VideoAsset]): - pass - - -class _VideoAssets(_VideoAssetsBase): +class VideoTestAssets(list[VideoAsset]): def __init__(self) -> None: super().__init__([ - VideoAsset("sample_demo_1.mp4"), + VideoAsset("baby_reading"), ]) - def prompts(self, prompts: _VideoAssetPrompts) -> list[str]: - return [prompts["sample_demo_1"]] + def prompts(self, prompts: VideoAssetPrompts) -> list[str]: + return [prompts["baby_reading"]] -class _AudioAssetsBase(UserList[AudioAsset]): - pass +class AudioAssetPrompts(TypedDict): + mary_had_lamb: str + winning_call: str -class _AudioAssets(_AudioAssetsBase): +class AudioTestAssets(list[AudioAsset]): def __init__(self) -> None: super().__init__([ @@ -116,13 +107,16 @@ class _AudioAssets(_AudioAssetsBase): AudioAsset("winning_call"), ]) + def prompts(self, prompts: AudioAssetPrompts) -> list[str]: + return [prompts["mary_had_lamb"], prompts["winning_call"]] -IMAGE_ASSETS = _ImageAssets() -"""Singleton instance of :class:`_ImageAssets`.""" -VIDEO_ASSETS = _VideoAssets() -"""Singleton instance of :class:`_VideoAssets`.""" -AUDIO_ASSETS = _AudioAssets() -"""Singleton instance of :class:`_AudioAssets`.""" + +IMAGE_ASSETS = ImageTestAssets() +"""Singleton instance of :class:`ImageTestAssets`.""" +VIDEO_ASSETS = VideoTestAssets() +"""Singleton instance of :class:`VideoTestAssets`.""" +AUDIO_ASSETS = AudioTestAssets() +"""Singleton instance of :class:`AudioTestAssets`.""" @pytest.fixture(scope="function", autouse=True) @@ -270,17 +264,17 @@ def example_long_prompts() -> list[str]: @pytest.fixture(scope="session") -def image_assets() -> _ImageAssets: +def image_assets() -> ImageTestAssets: return IMAGE_ASSETS @pytest.fixture(scope="session") -def video_assets() -> _VideoAssets: +def video_assets() -> VideoTestAssets: return VIDEO_ASSETS @pytest.fixture(scope="session") -def audio_assets() -> _AudioAssets: +def audio_assets() -> AudioTestAssets: return AUDIO_ASSETS @@ -779,7 +773,7 @@ class VllmRunner: def get_inputs( self, - prompts: list[str], + prompts: Union[list[str], list[torch.Tensor]], images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, @@ -801,16 +795,18 @@ class VllmRunner: if audios is not None and (audio := audios[i]) is not None: multi_modal_data["audio"] = audio - inputs.append( - TextPrompt(prompt=prompt, - multi_modal_data=multi_modal_data - if multi_modal_data else None)) + text_prompt_kwargs = { + ("prompt" if isinstance(prompt, str) else "prompt_embeds"): + prompt, + "multi_modal_data": multi_modal_data or None + } + inputs.append(TextPrompt(**text_prompt_kwargs)) return inputs def generate( self, - prompts: list[str], + prompts: Union[list[str], list[torch.Tensor]], sampling_params: SamplingParams, images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, @@ -836,7 +832,7 @@ class VllmRunner: output_str = sample.text output_ids = list(sample.token_ids) req_sample_output_ids.append(prompt_ids + output_ids) - req_sample_output_strs.append(prompt_str + output_str) + req_sample_output_strs.append((prompt_str or "") + output_str) outputs.append((req_sample_output_ids, req_sample_output_strs)) return outputs @@ -903,7 +899,7 @@ class VllmRunner: def generate_greedy( self, - prompts: list[str], + prompts: Union[list[str], list[torch.Tensor]], max_tokens: int, images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 8bd64923fe229..a5ba16898d891 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -2,16 +2,18 @@ import time from collections import deque +from typing import Optional from unittest.mock import MagicMock import pytest # noqa +import torch from torch import Use # noqa from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.core.interfaces import AllocStatus from vllm.core.scheduler import Scheduler, SchedulingBudget from vllm.lora.request import LoRARequest -from vllm.sequence import SequenceGroup +from vllm.sequence import SequenceGroup, SequenceStatus from .utils import (append_new_token, append_new_token_seq, append_new_token_seq_group, create_dummy_prompt, @@ -968,3 +970,73 @@ def test_no_multiple_partial_prefills_with_chunked_prefill_and_prefix_caching( ), "A partial prefix of C (4 tokens) should be prefilled, with the " "remaining tokens fit into 3 token budget (4-1 from the seqA). It will " "then be rounded down to 2 tokens on block size, thus 6 tokens in total." + + +def test_no_batches_mixed_with_prompt_tokens_and_prompt_embeds(): + """ + Test that the scheduler does not schedule batches with prompt tokens and + prompt embeddings co-mingled. + """ + block_size = 2 + max_seq_group = 3 + scheduler = initialize_scheduler( + block_size=block_size, + num_cpu_blocks=16, + num_gpu_blocks=16, + max_num_seqs=max_seq_group, + max_model_len=100, + enable_prefix_caching=True, + ) + + # the odd indexed inputs should be passed in via embeddings, + # evens via token_ids + seq_length = 7 + embedding_size = 5 + num_seqs = 11 + seq_tokens: list[list[int]] = [] + seq_embeds: list[Optional[torch.Tensor]] = [] + for i in range(num_seqs): + if i % 2: + seq_tokens.append(list(range(seq_length))) + seq_embeds.append(None) + else: + seq_tokens.append([0] * seq_length) + seq_embeds.append(torch.rand(embedding_size)) + + seq_and_seq_groups = [ + create_dummy_prompt(f"{i}", + prompt_tokens=seq_tokens[i], + prompt_embeds=seq_embeds[i], + block_size=block_size) + for i in range(len(seq_tokens)) + ] + + for _, seq_group in seq_and_seq_groups: + scheduler.add_seq_group(seq_group) + + while not all(seq.is_finished() for seq, _ in seq_and_seq_groups): + unfinished_seq_groups = [ + seq_group for _, seq_group in seq_and_seq_groups + if not seq_group.is_finished() + ] + _, out = schedule_and_update_computed_tokens(scheduler) + assert len(out.scheduled_seq_groups) > 0 + batch_is_prompt_embeds = out.scheduled_seq_groups[ + 0].seq_group.uses_prompt_embeds() + expected_scheduled_seq_groups = [ + seq_group for seq_group in unfinished_seq_groups + if seq_group.uses_prompt_embeds() == batch_is_prompt_embeds + ] + + # We should have as many scheduled groups as possible, without mixing + assert len(out.scheduled_seq_groups) == min( + max_seq_group, len(expected_scheduled_seq_groups)) + assert all(scheduled_seq_group.seq_group.uses_prompt_embeds() == + batch_is_prompt_embeds + for scheduled_seq_group in out.scheduled_seq_groups) + + # Finish the scheduled groups + for scheduled_seq_group in out.scheduled_seq_groups: + for seq in scheduled_seq_group.seq_group.seqs: + seq.status = SequenceStatus.FINISHED_STOPPED + scheduler.free_finished_seq_groups() diff --git a/tests/core/utils.py b/tests/core/utils.py index ea18b879a3172..84b0426b470bc 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -5,9 +5,11 @@ from collections import defaultdict from collections.abc import Sequence as GenericSequence from typing import Any, Optional +import torch + from vllm import SamplingParams from vllm.core.scheduler import Scheduler, SchedulerOutputs -from vllm.inputs import EncoderDecoderInputs, token_inputs +from vllm.inputs import EncoderDecoderInputs, embeds_inputs, token_inputs from vllm.lora.request import LoRARequest from vllm.sequence import (Logprob, Sequence, SequenceGroup, SequenceGroupMetadata) @@ -19,6 +21,7 @@ def create_dummy_prompt( block_size: Optional[int] = None, lora_request: Optional[LoRARequest] = None, prompt_tokens: Optional[list[int]] = None, + prompt_embeds: Optional[torch.Tensor] = None, min_tokens: int = 0, max_tokens: int = 16, ) -> tuple[Sequence, SequenceGroup]: @@ -31,9 +34,13 @@ def create_dummy_prompt( prompt_tokens = list(range(prompt_length)) prompt_str = " ".join([str(t) for t in prompt_tokens]) + inputs = token_inputs( + prompt_token_ids=prompt_tokens, + prompt=prompt_str) if prompt_embeds is None else embeds_inputs( + prompt_embeds=prompt_embeds) prompt = Sequence( int(request_id), - inputs=token_inputs(prompt_tokens, prompt=prompt_str), + inputs=inputs, block_size=block_size, ) seq_group = SequenceGroup( diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 16721ee9ce747..65471cb3af38e 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -106,6 +106,8 @@ class DummyConfigClass: """List with literal choices""" literal_literal: Literal[Literal[1], Literal[2]] = 1 """Literal of literals with default 1""" + json_tip: dict = field(default_factory=dict) + """Dict which will be JSON in CLI""" @pytest.mark.parametrize(("type_hint", "expected"), [ @@ -137,6 +139,9 @@ def test_get_kwargs(): assert kwargs["list_literal"]["choices"] == [1, 2] # literals of literals should have merged choices assert kwargs["literal_literal"]["choices"] == [1, 2] + # dict should have json tip in help + json_tip = "\n\nShould be a valid JSON string." + assert kwargs["json_tip"]["help"].endswith(json_tip) @pytest.mark.parametrize(("arg", "expected"), [ diff --git a/tests/engine/test_options.py b/tests/engine/test_options.py new file mode 100644 index 0000000000000..0cf4f69d56a87 --- /dev/null +++ b/tests/engine/test_options.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: Apache-2.0 +from contextlib import nullcontext + +import pytest + +from vllm.entrypoints.llm import LLM +from vllm.sampling_params import SamplingParams + + +@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) +def test_skip_tokenizer_initialization(model: str): + # This test checks if the flag skip_tokenizer_init skips the initialization + # of tokenizer and detokenizer. The generated output is expected to contain + # token ids. + llm = LLM( + model=model, + skip_tokenizer_init=True, + enforce_eager=True, + ) + sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) + + with pytest.raises(ValueError, match="cannot pass text prompts when"): + llm.generate("abc", sampling_params) + + outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, + sampling_params=sampling_params) + assert len(outputs) > 0 + completions = outputs[0].outputs + assert len(completions) > 0 + assert completions[0].text == "" + assert completions[0].token_ids + + +@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) +@pytest.mark.parametrize("enable_prompt_embeds", [True, False]) +def test_enable_prompt_embeds(hf_runner, model: str, + enable_prompt_embeds: bool): + prompt = "abc" + + with hf_runner(model) as hf_model: + token_ids = hf_model.tokenizer(prompt, return_tensors="pt").input_ids + token_ids = token_ids.to(hf_model.model.device) + + embed_layer = hf_model.model.get_input_embeddings() + prompt_embeds = embed_layer(token_ids).squeeze(0) + + ctx = (nullcontext() if enable_prompt_embeds else pytest.raises( + ValueError, match="set `--enable-prompt-embeds`")) + + # This test checks if the flag skip_tokenizer_init skips the initialization + # of tokenizer and detokenizer. The generated output is expected to contain + # token ids. + llm = LLM( + model=model, + enable_prompt_embeds=enable_prompt_embeds, + enforce_eager=True, + ) + + with ctx: + llm.generate({"prompt_embeds": prompt_embeds}) diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py deleted file mode 100644 index 5e197f5ffe592..0000000000000 --- a/tests/engine/test_skip_tokenizer_init.py +++ /dev/null @@ -1,29 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -from vllm.entrypoints.llm import LLM -from vllm.sampling_params import SamplingParams - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_skip_tokenizer_initialization(model: str): - # This test checks if the flag skip_tokenizer_init skips the initialization - # of tokenizer and detokenizer. The generated output is expected to contain - # token ids. - llm = LLM( - model=model, - skip_tokenizer_init=True, - ) - sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) - - with pytest.raises(ValueError, match="cannot pass text prompts when"): - llm.generate("abc", sampling_params) - - outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, - sampling_params=sampling_params) - assert len(outputs) > 0 - completions = outputs[0].outputs - assert len(completions) > 0 - assert completions[0].text == "" - assert completions[0].token_ids diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 425f36984a33b..f2cca65ae4209 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -420,7 +420,8 @@ def test_fused_marlin_moe( score = torch.randn((m, e), device="cuda", dtype=dtype) - topk_weights, topk_ids = fused_topk(a, score, topk, False) + topk_weights, topk_ids, token_expert_indices = fused_topk( + a, score, topk, False) torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, e_map) diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py new file mode 100644 index 0000000000000..dfcd61f775870 --- /dev/null +++ b/tests/kernels/moe/test_moe_permute_unpermute.py @@ -0,0 +1,223 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Tests for the MOE permute/unpermute kernel + +Run `pytest tests/kernels/test_moe_permute_unpermute.py`. +""" + +from typing import Optional + +import numpy as np +import pytest +import torch + +from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk +from vllm.model_executor.layers.fused_moe.layer import determine_expert_map +from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( + moe_permute, moe_unpermute) +from vllm.platforms import current_platform + +NUM_EXPERTS = [16, 64] +TOP_KS = [2, 4, 6, 8] +EP_SIZE = [1, 4, 16] +current_platform.seed_everything(0) + + +def torch_permute(hidden_states: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indices: torch.Tensor, + topk: int, + n_expert: int, + n_local_expert: int, + start_expert: int, + expert_map: Optional[torch.Tensor] = None, + align_block_size: Optional[int] = None, + fill_invalid_expert: int = -1) -> list[torch.Tensor]: + n_token, n_hidden = hidden_states.shape[0], hidden_states.shape[1] + if expert_map is not None: + is_local_expert = (expert_map[topk_ids] != -1) + not_local_expert = (expert_map[topk_ids] == -1) + topk_ids = is_local_expert * ( + topk_ids - start_expert) + not_local_expert * (topk_ids + n_expert) + + sorted_topk_ids, sorted_indices = torch.sort(topk_ids.flatten(), + stable=True) + dst_row_id2src_row_id_map = token_expert_indices.flatten()[sorted_indices] + + expert_first_token_offset = torch.zeros(n_local_expert + 1, + dtype=torch.int64, + device="cuda") + idx = 0 + for i in range(0, n_local_expert): + cnt = 0 + while idx < sorted_topk_ids.numel() and sorted_topk_ids[idx] == i: + cnt += 1 + idx += 1 + expert_first_token_offset[i + 1] = expert_first_token_offset[i] + cnt + + _, src2dst_idx = torch.sort(dst_row_id2src_row_id_map) + valid_row_idx = [] + if align_block_size is None: + + permuted_hidden_states = hidden_states[dst_row_id2src_row_id_map % + n_token, ...] + permuted_row_size = permuted_hidden_states.shape[0] + m_indices = torch.empty(permuted_row_size, + device="cuda", + dtype=torch.int32).fill_(fill_invalid_expert) + for i in range(1, n_local_expert + 1): + first_token_offset = expert_first_token_offset[i - 1] + last_token_offset = expert_first_token_offset[i] + m_indices[first_token_offset:last_token_offset] = i - 1 + src_row_id2dst_row_id_map = torch.arange( + 0, n_token * topk, device="cuda", + dtype=torch.int32)[src2dst_idx].reshape((n_token, topk)) + valid_row_idx += [i for i in range(expert_first_token_offset[-1])] + return [ + permuted_hidden_states, expert_first_token_offset, + src_row_id2dst_row_id_map, m_indices, valid_row_idx + ] + else: + permuted_row_size = (topk * n_token + n_expert * + (align_block_size - 1) + align_block_size - + 1) // align_block_size * align_block_size + permuted_hidden_states = torch.empty((permuted_row_size, n_hidden), + device="cuda", + dtype=hidden_states.dtype) + align_src_row_id2dst_row_id = torch.empty(n_token * topk, + device="cuda", + dtype=torch.int32) + align_expert_first_token_offset = torch.zeros_like( + expert_first_token_offset) + m_indices = torch.empty(permuted_row_size, + device="cuda", + dtype=torch.int32).fill_(fill_invalid_expert) + # get align_permuted_hidden_states, + # valid row_idx and align_expert_first_token_offset + for i in range(1, n_local_expert + 1): + first_token_offset = expert_first_token_offset[i - 1] + last_token_offset = expert_first_token_offset[i] + n_token_in_expert = last_token_offset - first_token_offset + align_expert_first_token_offset[ + i] = align_expert_first_token_offset[ + i - 1] + (n_token_in_expert + align_block_size - + 1) // align_block_size * align_block_size + align_first_token_offset = align_expert_first_token_offset[i - 1] + align_last_token_offset = align_expert_first_token_offset[i] + dst_row_id2src_row_id_in_expert = dst_row_id2src_row_id_map[ + first_token_offset:first_token_offset + + n_token_in_expert] % n_token + # store token in current expert with align_first_token_offset + permuted_hidden_states[align_first_token_offset:\ + align_first_token_offset+n_token_in_expert,\ + ...] = hidden_states[\ + dst_row_id2src_row_id_in_expert, ...] + # set current expert m_indices + m_indices[align_first_token_offset:align_last_token_offset] = i - 1 + valid_row_idx += [ + i for i in range(align_first_token_offset, + align_first_token_offset + n_token_in_expert) + ] + # get align_src_row_id2dst_row_id + for i in range(n_token * topk): + eid = sorted_topk_ids[i] + if (eid >= n_local_expert): + # check token not in local expert + align_src_row_id2dst_row_id[ + i] = align_expert_first_token_offset[-1] + continue + first_token_offset = expert_first_token_offset[eid] + align_first_token_offset = align_expert_first_token_offset[eid] + token_offset = i - first_token_offset + align_src_row_id2dst_row_id[ + i] = align_first_token_offset + token_offset + align_src_row_id2dst_row_id = align_src_row_id2dst_row_id[\ + src2dst_idx].reshape((n_token, topk)) + return [ + permuted_hidden_states, align_expert_first_token_offset, + align_src_row_id2dst_row_id, m_indices, valid_row_idx + ] + + +def torch_unpermute(permuted_hidden_states: torch.Tensor, + topk_weights: torch.Tensor, topk_ids: torch.Tensor, + token_expert_indices: torch.Tensor, + src_row_id2dst_row_id_map: torch.Tensor, + valid_row_idx: torch.Tensor, topk: int, + n_expert: int) -> torch.Tensor: + # ignore invalid row + mask = torch.zeros(permuted_hidden_states.shape[0], + dtype=bool, + device="cuda") + mask[valid_row_idx] = True + permuted_hidden_states[~mask] = 0 + idx = src_row_id2dst_row_id_map.flatten()[ + token_expert_indices.flatten()].reshape(token_expert_indices.shape) + output = permuted_hidden_states[idx, ...] * topk_weights[..., None] + output = output.sum(dim=1).to(permuted_hidden_states.dtype) + return output + + +@pytest.mark.parametrize("n_token", [1, 33, 64, 222, 1024, 2048, 3000, 5000]) +@pytest.mark.parametrize("n_hidden", [2048, 4096, 7168]) +@pytest.mark.parametrize("n_expert", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("ep_size", EP_SIZE) +@pytest.mark.parametrize("align_block_size", [None, 128]) +def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int, + n_expert: int, ep_size: int, dtype: torch.dtype, + align_block_size: Optional[int]): + fill_invalid_expert = 0 + ep_rank = np.random.randint(0, ep_size) + expert_map = None + n_local_expert = n_expert + if (ep_size != 1): + n_local_expert, expert_map = determine_expert_map( + ep_size, ep_rank, n_expert) + expert_map = expert_map.cuda() + start_expert = n_local_expert * ep_rank + current_platform.seed_everything(0) + hidden_states = torch.randn((n_token, n_hidden), device="cuda").to(dtype) + gating_output = torch.randn((n_token, n_expert), device="cuda").to(dtype) + topk_weights, topk_ids, token_expert_indices = fused_topk( + hidden_states, gating_output, topk, False) + gold0, gold1, gold2, gold3, valid_row_idx = torch_permute( + hidden_states, + topk_ids, + token_expert_indices, + topk, + n_expert, + n_local_expert, + start_expert, + expert_map=expert_map, + align_block_size=align_block_size, + fill_invalid_expert=fill_invalid_expert) + + result0, result1, result2, result3 = moe_permute( + hidden_states, topk_weights, topk_ids, token_expert_indices, topk, + n_expert, n_local_expert, expert_map, align_block_size, + fill_invalid_expert) + + # check expert_first_token_offset + torch.testing.assert_close(gold1, result1, atol=0, rtol=0) + # check src_row_id2dst_row_id_map + torch.testing.assert_close(gold2, result2, atol=0, rtol=0) + # check mindice + torch.testing.assert_close(gold3, result3, atol=0, rtol=0) + # check permuted_hidden_states, only valid token + torch.testing.assert_close(gold0[valid_row_idx], + result0[valid_row_idx], + atol=0, + rtol=0) + + # add a random tensor to simulate group gemm + result0 = 0.5 * result0 + torch.randn_like(result0) + + result4 = moe_unpermute(result0, topk_weights, topk_ids, result2, result1, + topk, n_expert, n_local_expert) + gold4 = torch_unpermute(result0, topk_weights, topk_ids, + token_expert_indices, result2, valid_row_idx, topk, + n_local_expert) + + # check unpermuted hidden + torch.testing.assert_close(result4, gold4, atol=2e-2, rtol=0) diff --git a/tests/kernels/quantization/test_awq_marlin.py b/tests/kernels/quantization/test_awq_marlin.py index 939b0e7157be7..c30fe60becdfd 100644 --- a/tests/kernels/quantization/test_awq_marlin.py +++ b/tests/kernels/quantization/test_awq_marlin.py @@ -84,7 +84,8 @@ def test_fused_marlin_moe_awq( score = torch.randn((m, e), device="cuda", dtype=dtype) - topk_weights, topk_ids = fused_topk(a, score, topk, False) + topk_weights, topk_ids, token_expert_indices = fused_topk( + a, score, topk, False) marlin_output = torch.ops.vllm.fused_marlin_moe( a, qweight1, diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index c57e39f425064..38c7e461bb9c4 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -338,7 +338,8 @@ def deep_gemm_w8a8_block_fp8_moe(M, K, a, w1, w2, w1_s, w2_s, score, topk, M, K = a.shape N = w2.shape[-1] - topk_weight, topk_ids = fused_topk(a, score.float(), topk, False) + topk_weight, topk_ids, token_expert_indices = fused_topk( + a, score.float(), topk, False) block_m = deep_gemm.get_m_alignment_for_contiguous_layout() @@ -435,7 +436,8 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed): ref_out = torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_size) - topk_weights, topk_ids = fused_topk(a, score.float(), topk, False) + topk_weights, topk_ids, token_expert_indices = fused_topk( + a, score.float(), topk, False) out = deep_gemm_moe_fp8(a, w1, w2, w1_s, w2_s, topk_weights, topk_ids) diff --git a/tests/kv_transfer/test_disagg.py b/tests/kv_transfer/test_disagg.py index 5b9ea6dba401b..dc948a48bf326 100644 --- a/tests/kv_transfer/test_disagg.py +++ b/tests/kv_transfer/test_disagg.py @@ -14,8 +14,8 @@ import torch # Fixture to set up environment variables and teardown servers after tests @pytest.fixture(scope="module", autouse=True) def setup_servers(): - if torch.cuda.device_count() < 4: - pytest.skip("Skipping test: fewer than 4 GPUs available") + if torch.cuda.device_count() < 2: + pytest.skip("Skipping test: fewer than 2 GPUs available") # Set up environment variables VLLM_HOST_IP = subprocess.check_output("hostname -I | awk '{print $1}'", diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index ab2898ffb2d0b..c755593c9acb8 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -1,4 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +import os +from typing import Optional + import pytest import torch @@ -106,19 +109,38 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, # in parts of the operators pytest.skip(f"Skipping '{model}' model test with AITER kernel.") + use_prompt_embeds = os.getenv("VLLM_USE_V1") == "0" + with hf_runner(model) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) + prompt_embeds: Optional[list[torch.Tensor]] = ([] if use_prompt_embeds + else None) + + prompt_token_ids = [] + for prompt in example_prompts: + token_ids = hf_model.tokenizer(prompt, + return_tensors="pt").input_ids.to( + hf_model.model.device) + prompt_token_ids.append(token_ids) + if prompt_embeds is not None: + prompt_embeds.append(hf_model.model.get_input_embeddings()( + token_ids).squeeze(0)) + with vllm_runner( model, tokenizer_name=model_info.tokenizer or model, tokenizer_mode=model_info.tokenizer_mode, trust_remote_code=model_info.trust_remote_code, max_num_seqs=2, + enable_prompt_embeds=use_prompt_embeds, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) + if prompt_embeds is not None: + vllm_outputs_from_embeds = vllm_model.generate_greedy_logprobs( + prompt_embeds, max_tokens, num_logprobs) check_logprobs_close( outputs_0_lst=hf_outputs, @@ -126,6 +148,14 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, name_0="hf", name_1="vllm", ) + if prompt_embeds is not None: + check_logprobs_close( + outputs_0_lst=vllm_outputs, + outputs_1_lst=vllm_outputs_from_embeds, + name_0="vllm", + name_1="vllm_from_embeds", + ) + if use_rocm_aiter: # this is to ensure that vllm engine # has deallocated the memory before running the next diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index b21c80bef927d..6e915a9f6005d 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -8,13 +8,14 @@ from collections import defaultdict from pathlib import PosixPath import pytest -from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq +from transformers import (AutoModelForImageTextToText, + AutoModelForTextToWaveform, AutoModelForVision2Seq) from vllm.platforms import current_platform from vllm.utils import identity -from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets, - _VideoAssets) +from ....conftest import (IMAGE_ASSETS, HfRunner, ImageTestAssets, + VideoTestAssets, VllmRunner) from ....utils import (create_new_process_for_each_test, large_gpu_mark, multi_gpu_marks) from ...utils import check_outputs_equal @@ -140,7 +141,7 @@ VLM_TEST_SETTINGS = { marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), "qwen2_5_omni": VLMTestInfo( - models=["Qwen/Qwen2.5-Omni-7B"], + models=["Qwen/Qwen2.5-Omni-3B"], test_type=( VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, @@ -151,8 +152,9 @@ VLM_TEST_SETTINGS = { video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501 max_model_len=4096, max_num_seqs=2, - auto_cls=AutoModelForVision2Seq, + auto_cls=AutoModelForTextToWaveform, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, + patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), @@ -691,7 +693,7 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets, monkeypatch): + image_assets: ImageTestAssets, monkeypatch): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -716,7 +718,7 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets, monkeypatch): + image_assets: ImageTestAssets, monkeypatch): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -741,7 +743,7 @@ def test_image_embedding_models(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets, monkeypatch): + image_assets: ImageTestAssets, monkeypatch): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -763,7 +765,7 @@ def test_image_embedding_models(model_type: str, )) def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - video_assets: _VideoAssets, monkeypatch): + video_assets: VideoTestAssets, monkeypatch): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -814,7 +816,7 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets, monkeypatch): + image_assets: ImageTestAssets, monkeypatch): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -840,7 +842,7 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets, monkeypatch): + image_assets: ImageTestAssets, monkeypatch): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -866,7 +868,8 @@ def test_image_embedding_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets, monkeypatch): + image_assets: ImageTestAssets, + monkeypatch): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] @@ -889,7 +892,7 @@ def test_image_embedding_models_heavy(model_type: str, def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - video_assets: _VideoAssets, monkeypatch): + video_assets: VideoTestAssets, monkeypatch): if model_type in REQUIRES_V0_MODELS: monkeypatch.setenv("VLLM_USE_V1", "0") model_test_info = VLM_TEST_SETTINGS[model_type] diff --git a/tests/models/multimodal/generation/test_florence2.py b/tests/models/multimodal/generation/test_florence2.py index 14b64393bf52a..b8225f5f12437 100644 --- a/tests/models/multimodal/generation/test_florence2.py +++ b/tests/models/multimodal/generation/test_florence2.py @@ -9,7 +9,7 @@ from vllm.inputs.data import ExplicitEncoderDecoderPrompt, TextPrompt from vllm.multimodal.image import rescale_image_size from vllm.sequence import SampleLogprobs -from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets +from ....conftest import IMAGE_ASSETS, HfRunner, ImageTestAssets, VllmRunner from ...utils import check_logprobs_close MODELS = ["microsoft/Florence-2-base"] @@ -118,7 +118,7 @@ def run_test( @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) def test_models(hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets, model: str, + image_assets: ImageTestAssets, model: str, size_factors: list[int], dtype: str, max_tokens: int, num_logprobs: int) -> None: images = [asset.pil_image for asset in image_assets] diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py index 7c14845ec54d4..96c444441e3d2 100644 --- a/tests/models/multimodal/generation/test_granite_speech.py +++ b/tests/models/multimodal/generation/test_granite_speech.py @@ -9,7 +9,8 @@ from transformers import AutoModelForSpeechSeq2Seq from vllm.lora.request import LoRARequest from vllm.sequence import SampleLogprobs -from ....conftest import HfRunner, PromptAudioInput, VllmRunner, _AudioAssets +from ....conftest import (AudioTestAssets, HfRunner, PromptAudioInput, + VllmRunner) from ...registry import HF_EXAMPLE_MODELS from ...utils import check_logprobs_close @@ -116,9 +117,9 @@ def run_test( @pytest.mark.parametrize("max_model_len", [2048]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [10]) -def test_models(hf_runner, vllm_runner, model: str, audio_assets: _AudioAssets, - dtype: str, max_model_len: int, max_tokens: int, - num_logprobs: int) -> None: +def test_models(hf_runner, vllm_runner, model: str, + audio_assets: AudioTestAssets, dtype: str, max_model_len: int, + max_tokens: int, num_logprobs: int) -> None: model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") diff --git a/tests/models/multimodal/generation/test_interleaved.py b/tests/models/multimodal/generation/test_interleaved.py index 92c8155fe1e25..eec84751e4504 100644 --- a/tests/models/multimodal/generation/test_interleaved.py +++ b/tests/models/multimodal/generation/test_interleaved.py @@ -29,7 +29,7 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None: image_cherry = ImageAsset("cherry_blossom").pil_image.convert("RGB") image_stop = ImageAsset("stop_sign").pil_image.convert("RGB") images = [image_cherry, image_stop] - video = VideoAsset(name="sample_demo_1.mp4", num_frames=16).np_ndarrays + video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays inputs = [ ( diff --git a/tests/models/multimodal/generation/test_mllama.py b/tests/models/multimodal/generation/test_mllama.py index 1e09c8673dc32..99aa3c2d3bd99 100644 --- a/tests/models/multimodal/generation/test_mllama.py +++ b/tests/models/multimodal/generation/test_mllama.py @@ -14,8 +14,8 @@ from vllm.model_executor.models.mllama import MllamaForConditionalGeneration from vllm.multimodal.image import rescale_image_size from vllm.sequence import SampleLogprobs -from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, - _ImageAssets) +from ....conftest import (IMAGE_ASSETS, HfRunner, ImageTestAssets, + PromptImageInput, VllmRunner) from ....quantization.utils import is_quant_method_supported from ....utils import (create_new_process_for_each_test, large_gpu_test, multi_gpu_test) @@ -90,7 +90,7 @@ def vllm_to_hf_output(vllm_output: tuple[list[int], str, def _get_inputs( - image_assets: _ImageAssets, + image_assets: ImageTestAssets, *, size_factors: Optional[list[float]] = None, sizes: Optional[list[tuple[int, int]]] = None, @@ -126,7 +126,7 @@ def _get_inputs( def run_test( hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets, + image_assets: ImageTestAssets, model: str, *, size_factors: list[float], @@ -143,7 +143,7 @@ def run_test( def run_test( hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets, + image_assets: ImageTestAssets, model: str, *, sizes: list[tuple[int, int]], @@ -159,7 +159,7 @@ def run_test( def run_test( hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets, + image_assets: ImageTestAssets, model: str, *, size_factors: Optional[list[float]] = None, @@ -433,7 +433,7 @@ def test_models_distributed( @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), reason='bitsandbytes is not supported on this GPU type.') def test_bnb_regression( - image_assets: _ImageAssets, + image_assets: ImageTestAssets, model: str, dtype: str, max_tokens: int, @@ -473,7 +473,7 @@ def test_bnb_regression( @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [32]) def test_explicit_implicit_prompt( - image_assets: _ImageAssets, + image_assets: ImageTestAssets, model: str, dtype: str, max_tokens: int, diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index 0b27a4caf6eb7..6be401b775ec2 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -50,7 +50,7 @@ IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ }) VIDEO_PROMPTS = VIDEO_ASSETS.prompts({ - "sample_demo_1": + "baby_reading": qwen2_vl_chat_template( VIDEO_PLACEHOLDER, "Describe this video with a short sentence ", diff --git a/tests/models/multimodal/generation/test_ultravox.py b/tests/models/multimodal/generation/test_ultravox.py index 1d7de946a3f80..322d886a593d5 100644 --- a/tests/models/multimodal/generation/test_ultravox.py +++ b/tests/models/multimodal/generation/test_ultravox.py @@ -11,13 +11,22 @@ from transformers import AutoModel, AutoTokenizer from vllm.multimodal.audio import resample_audio_librosa from vllm.sequence import SampleLogprobs -from ....conftest import HfRunner, VllmRunner, _AudioAssets +from ....conftest import AUDIO_ASSETS, AudioTestAssets, HfRunner, VllmRunner from ....utils import RemoteOpenAIServer from ...registry import HF_EXAMPLE_MODELS from ...utils import check_logprobs_close MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b" +AUDIO_PROMPTS = AUDIO_ASSETS.prompts({ + "mary_had_lamb": + "Transcribe this into English.", + "winning_call": + "What is happening in this audio clip?", +}) + +MULTI_AUDIO_PROMPT = "Describe each of the audios above." + AudioTuple = tuple[np.ndarray, int] VLLM_PLACEHOLDER = "<|audio|>" @@ -31,12 +40,6 @@ CHUNKED_PREFILL_KWARGS = { } -@pytest.fixture(scope="module", params=("mary_had_lamb", "winning_call")) -def audio(request): - from vllm.assets.audio import AudioAsset - return AudioAsset(request.param) - - def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]: """Convert kwargs to CLI args.""" args = [] @@ -53,7 +56,7 @@ def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]: pytest.param({}, marks=pytest.mark.cpu_model), pytest.param(CHUNKED_PREFILL_KWARGS), ]) -def server(request, audio_assets: _AudioAssets): +def server(request, audio_assets: AudioTestAssets): args = [ "--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager", "--limit-mm-per-prompt", @@ -199,15 +202,19 @@ def run_multi_audio_test( pytest.param({}, marks=pytest.mark.cpu_model), pytest.param(CHUNKED_PREFILL_KWARGS), ]) -def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, - num_logprobs: int, vllm_kwargs: dict) -> None: +def test_models(hf_runner, vllm_runner, audio_assets: AudioTestAssets, + dtype: str, max_tokens: int, num_logprobs: int, + vllm_kwargs: dict) -> None: + audio_inputs = [( + _get_prompt(1, audio, VLLM_PLACEHOLDER), + _get_prompt(1, audio, HF_PLACEHOLDER), + audio.audio_and_sample_rate, + ) for audio in audio_assets] - vllm_prompt = _get_prompt(1, "Describe the audio above.", VLLM_PLACEHOLDER) - hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER) run_test( hf_runner, vllm_runner, - [(vllm_prompt, hf_prompt, audio.audio_and_sample_rate)], + audio_inputs, MODEL_NAME, dtype=dtype, max_tokens=max_tokens, @@ -224,13 +231,12 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, pytest.param({}, marks=pytest.mark.cpu_model), pytest.param(CHUNKED_PREFILL_KWARGS), ]) -def test_models_with_multiple_audios(vllm_runner, audio_assets: _AudioAssets, - dtype: str, max_tokens: int, - num_logprobs: int, +def test_models_with_multiple_audios(vllm_runner, + audio_assets: AudioTestAssets, dtype: str, + max_tokens: int, num_logprobs: int, vllm_kwargs: dict) -> None: - vllm_prompt = _get_prompt(len(audio_assets), - "Describe each of the audios above.", + vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT, VLLM_PLACEHOLDER) run_multi_audio_test( vllm_runner, @@ -245,7 +251,7 @@ def test_models_with_multiple_audios(vllm_runner, audio_assets: _AudioAssets, @pytest.mark.asyncio -async def test_online_serving(client, audio_assets: _AudioAssets): +async def test_online_serving(client, audio_assets: AudioTestAssets): """Exercises online serving with/without chunked prefill enabled.""" messages = [{ diff --git a/tests/models/multimodal/generation/vlm_utils/builders.py b/tests/models/multimodal/generation/vlm_utils/builders.py index bf5f87ebf9847..e3ba955a96a6b 100644 --- a/tests/models/multimodal/generation/vlm_utils/builders.py +++ b/tests/models/multimodal/generation/vlm_utils/builders.py @@ -11,7 +11,7 @@ from vllm.multimodal.image import rescale_image_size from vllm.multimodal.video import (rescale_video_size, resize_video, sample_frames_from_video) -from .....conftest import _ImageAssets, _VideoAssets +from .....conftest import ImageTestAssets, VideoTestAssets from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER, TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT, ImageSizeWrapper, SizeType, VLMTestInfo) @@ -69,7 +69,7 @@ def get_model_prompts(base_prompts: Iterable[str], def build_single_image_inputs_from_test_info( test_info: VLMTestInfo, - image_assets: _ImageAssets, + image_assets: ImageTestAssets, size_wrapper: ImageSizeWrapper, tmp_path: Optional[PosixPath] = None): if test_info.prompt_formatter is None: @@ -116,7 +116,7 @@ def build_single_image_inputs(images, model_prompts, def build_multi_image_inputs_from_test_info( test_info: VLMTestInfo, - image_assets: _ImageAssets, + image_assets: ImageTestAssets, size_wrapper: ImageSizeWrapper, tmp_path: Optional[PosixPath] = None): if test_info.prompt_formatter is None: @@ -159,7 +159,7 @@ def build_multi_image_inputs(image_lists, model_prompts, def build_embedding_inputs_from_test_info( test_info: VLMTestInfo, - image_assets: _ImageAssets, + image_assets: ImageTestAssets, size_wrapper: ImageSizeWrapper, ): # These conditions will always be true if invoked through filtering, @@ -192,7 +192,7 @@ def build_embedding_inputs_from_test_info( def build_video_inputs_from_test_info( test_info: VLMTestInfo, - video_assets: _VideoAssets, + video_assets: VideoTestAssets, size_wrapper: ImageSizeWrapper, num_frames: int, ): diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index c856fb198b329..f0f4ed989241f 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -16,7 +16,7 @@ from transformers import (AutoConfig, AutoTokenizer, BatchFeature, from vllm.sequence import SampleLogprobs from vllm.transformers_utils.tokenizer import patch_padding_side -from .....conftest import HfRunner, ImageAsset, _ImageAssets +from .....conftest import HfRunner, ImageAsset, ImageTestAssets from .types import RunnerOutput @@ -238,14 +238,14 @@ def minimax_vl_01_hf_output(hf_output: RunnerOutput, ####### Functions for converting image assets to embeddings -def get_llava_embeddings(image_assets: _ImageAssets): +def get_llava_embeddings(image_assets: ImageTestAssets): return [asset.image_embeds for asset in image_assets] ####### Prompt path encoders for models that need models on disk def qwen_prompt_path_encoder( - tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset], - _ImageAssets]) -> str: + tmp_path: PosixPath, prompt: str, + assets: Union[list[ImageAsset], ImageTestAssets]) -> str: """Given a temporary dir path, export one or more image assets into the tempdir & replace its contents with the local path to the string so that the HF version of Qwen-VL can resolve the path and load the image in its @@ -706,3 +706,11 @@ def ovis2_patch_hf_runner(hf_model: HfRunner) -> HfRunner: hf_model.processor = processor return hf_model + + +def qwen2_5_omni_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + """Patches and returns an instance of the HfRunner for Qwen2.5-Omni.""" + thinker = hf_model.model.thinker + thinker.get_output_embeddings = lambda: thinker.lm_head + hf_model.model = thinker + return hf_model diff --git a/tests/models/multimodal/generation/vlm_utils/runners.py b/tests/models/multimodal/generation/vlm_utils/runners.py index 023df5f161880..34753121ea902 100644 --- a/tests/models/multimodal/generation/vlm_utils/runners.py +++ b/tests/models/multimodal/generation/vlm_utils/runners.py @@ -4,7 +4,8 @@ types / modalities. """ from pathlib import PosixPath -from .....conftest import HfRunner, VllmRunner, _ImageAssets, _VideoAssets +from .....conftest import (HfRunner, ImageTestAssets, VideoTestAssets, + VllmRunner) from . import builders, core from .types import ExpandableVLMTestArgs, VLMTestInfo @@ -14,7 +15,7 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets): + image_assets: ImageTestAssets): assert test_case.size_wrapper is not None inputs = builders.build_single_image_inputs_from_test_info( model_test_info, image_assets, test_case.size_wrapper, tmp_path) @@ -37,7 +38,7 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets): + image_assets: ImageTestAssets): assert test_case.size_wrapper is not None inputs = builders.build_multi_image_inputs_from_test_info( model_test_info, image_assets, test_case.size_wrapper, tmp_path) @@ -60,7 +61,7 @@ def run_embedding_test(*, model_test_info: VLMTestInfo, test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - image_assets: _ImageAssets): + image_assets: ImageTestAssets): assert test_case.size_wrapper is not None inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info( model_test_info, image_assets, test_case.size_wrapper) @@ -86,7 +87,7 @@ def run_video_test( test_case: ExpandableVLMTestArgs, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - video_assets: _VideoAssets, + video_assets: VideoTestAssets, ): assert test_case.size_wrapper is not None assert test_case.num_video_frames is not None diff --git a/tests/models/multimodal/generation/vlm_utils/types.py b/tests/models/multimodal/generation/vlm_utils/types.py index 1ae61ea472295..56629323394d1 100644 --- a/tests/models/multimodal/generation/vlm_utils/types.py +++ b/tests/models/multimodal/generation/vlm_utils/types.py @@ -15,7 +15,7 @@ from vllm.config import TaskOption from vllm.sequence import SampleLogprobs from vllm.transformers_utils.tokenizer import AnyTokenizer -from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets +from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, ImageTestAssets from ....utils import check_logprobs_close # meta image tag; will be replaced by the appropriate tag for the model @@ -85,7 +85,7 @@ class VLMTestInfo(NamedTuple): # Function for converting ImageAssets to image embeddings; # We need to define this explicitly for embedding tests - convert_assets_to_embeddings: Optional[Callable[[_ImageAssets], + convert_assets_to_embeddings: Optional[Callable[[ImageTestAssets], torch.Tensor]] = None # Exposed options for vLLM runner; we change these in a several tests, @@ -141,7 +141,7 @@ class VLMTestInfo(NamedTuple): # for Qwen-VL, which requires encoding the image path / url into the prompt # for HF runner prompt_path_encoder: Optional[ - Callable[[PosixPath, str, Union[list[ImageAsset], _ImageAssets]], + Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]], str]] = None # noqa: E501 # Allows configuring a test to run with custom inputs diff --git a/tests/models/multimodal/generation/test_intern_vit.py b/tests/models/multimodal/pooling/test_intern_vit.py similarity index 84% rename from tests/models/multimodal/generation/test_intern_vit.py rename to tests/models/multimodal/pooling/test_intern_vit.py index a842d14fee2e5..76f9fbe025505 100644 --- a/tests/models/multimodal/generation/test_intern_vit.py +++ b/tests/models/multimodal/pooling/test_intern_vit.py @@ -1,33 +1,34 @@ # SPDX-License-Identifier: Apache-2.0 - -from typing import Optional - import pytest import torch import torch.nn as nn from huggingface_hub import snapshot_download from transformers import AutoConfig, AutoModel, CLIPImageProcessor -from ....conftest import _ImageAssets +from vllm.distributed import cleanup_dist_env_and_memory +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE + +from ....conftest import ImageTestAssets # we use snapshot_download to prevent conflicts between # dynamic_module and trust_remote_code for hf_runner DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"] +@torch.inference_mode() def run_intern_vit_test( - image_assets: _ImageAssets, + image_assets: ImageTestAssets, model_id: str, *, dtype: str, - distributed_executor_backend: Optional[str] = None, ): model = snapshot_download(model_id, allow_patterns=DOWNLOAD_PATTERN) + torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] img_processor = CLIPImageProcessor.from_pretrained(model) images = [asset.pil_image for asset in image_assets] pixel_values = [ - img_processor(images, return_tensors='pt').pixel_values.to(dtype) + img_processor(images, return_tensors='pt').pixel_values.to(torch_dtype) for images in images ] @@ -36,14 +37,13 @@ def run_intern_vit_test( config.norm_type = "rms_norm" hf_model = AutoModel.from_pretrained(model, - torch_dtype=dtype, + torch_dtype=torch_dtype, trust_remote_code=True).to("cuda") hf_outputs_per_image = [ hf_model(pixel_value.to("cuda")).last_hidden_state for pixel_value in pixel_values ] - from vllm.distributed import cleanup_dist_env_and_memory from vllm.model_executor.models.intern_vit import InternVisionModel vllm_model = InternVisionModel(config) vllm_model.load_weights(hf_model.state_dict().items()) @@ -51,7 +51,7 @@ def run_intern_vit_test( del hf_model cleanup_dist_env_and_memory() - vllm_model = vllm_model.to("cuda", dtype) + vllm_model = vllm_model.to("cuda", torch_dtype) vllm_outputs_per_image = [ vllm_model(pixel_values=pixel_value.to("cuda")) for pixel_value in pixel_values @@ -69,8 +69,7 @@ def run_intern_vit_test( "OpenGVLab/InternViT-300M-448px", "OpenGVLab/InternViT-6B-448px-V1-5", ]) -@pytest.mark.parametrize("dtype", [torch.half]) -@torch.inference_mode() +@pytest.mark.parametrize("dtype", ["half"]) def test_models(dist_init, image_assets, model_id, dtype: str) -> None: run_intern_vit_test( image_assets, diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 2b1d38dfda97a..772a2db3e48af 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -284,7 +284,7 @@ def _test_processing_correctness_mistral( "Qwen/Qwen2-VL-2B-Instruct", "Qwen/Qwen2.5-VL-3B-Instruct", "Qwen/Qwen2-Audio-7B-Instruct", - "Qwen/Qwen2.5-Omni-7B", + "Qwen/Qwen2.5-Omni-3B", "Skywork/Skywork-R1V-38B", "fixie-ai/ultravox-v0_5-llama-3_2-1b", "openai/whisper-large-v3", diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py index 709a686577f34..37142b6dd36f1 100644 --- a/tests/models/multimodal/processing/test_h2ovl.py +++ b/tests/models/multimodal/processing/test_h2ovl.py @@ -11,7 +11,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import rescale_image_size from vllm.multimodal.processing import BaseMultiModalProcessor -from ....conftest import _ImageAssets +from ....conftest import ImageTestAssets from ...utils import build_model_context @@ -137,7 +137,7 @@ def _run_check( @pytest.mark.parametrize("kwargs_on_init", [True, False]) def test_processor_override( model_id: str, - image_assets: _ImageAssets, + image_assets: ImageTestAssets, size_factors: list[int], min_dynamic_patch: int, max_dynamic_patch: int, diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py index f5b5cf6b5ba96..c35ce2f6ab291 100644 --- a/tests/models/multimodal/processing/test_idefics3.py +++ b/tests/models/multimodal/processing/test_idefics3.py @@ -5,7 +5,7 @@ from transformers import Idefics3Config from vllm.multimodal import MULTIMODAL_REGISTRY -from ....conftest import _ImageAssets +from ....conftest import ImageTestAssets from ...utils import build_model_context @@ -21,7 +21,7 @@ from ...utils import build_model_context @pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("kwargs_on_init", [True, False]) def test_processor_override( - image_assets: _ImageAssets, + image_assets: ImageTestAssets, model_id: str, mm_processor_kwargs: dict[str, object], expected_toks_per_img: int, diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py index 5ac47ecc5cc17..7ec81197a3db6 100644 --- a/tests/models/multimodal/processing/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -11,7 +11,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import rescale_image_size from vllm.multimodal.processing import BaseMultiModalProcessor -from ....conftest import _ImageAssets +from ....conftest import ImageTestAssets from ...utils import build_model_context @@ -94,7 +94,7 @@ def _run_check( @pytest.mark.parametrize("kwargs_on_init", [True, False]) def test_processor_override( model_id: str, - image_assets: _ImageAssets, + image_assets: ImageTestAssets, size_factors: list[int], min_dynamic_patch: int, max_dynamic_patch: int, diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py index 2bfc2785feb6f..614f17dbbeda7 100644 --- a/tests/models/multimodal/processing/test_llama4.py +++ b/tests/models/multimodal/processing/test_llama4.py @@ -6,7 +6,7 @@ import pytest from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.transformers_utils.tokenizer import encode_tokens -from ....conftest import _ImageAssets +from ....conftest import ImageTestAssets from ...utils import build_model_context @@ -17,7 +17,7 @@ from ...utils import build_model_context @pytest.mark.parametrize("disable_mm_preprocessor_cache", [True, False]) @pytest.mark.parametrize("tokenized_prompt", [True, False]) def test_processor_override( - image_assets: _ImageAssets, + image_assets: ImageTestAssets, model_id: str, mm_processor_kwargs: dict, num_imgs: int, diff --git a/tests/models/multimodal/processing/test_minimax_vl_01.py b/tests/models/multimodal/processing/test_minimax_vl_01.py index 10de28ab54ce9..9bd2b9887294f 100644 --- a/tests/models/multimodal/processing/test_minimax_vl_01.py +++ b/tests/models/multimodal/processing/test_minimax_vl_01.py @@ -7,14 +7,14 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.parse import ImageSize from vllm.multimodal.processing import BaseMultiModalProcessor -from ....conftest import _ImageAssets +from ....conftest import ImageTestAssets from ...utils import build_model_context @pytest.mark.parametrize("model_id", ["MiniMaxAI/MiniMax-VL-01"]) @pytest.mark.parametrize("num_imgs", [1, 2]) def test_processor_override( - image_assets: _ImageAssets, + image_assets: ImageTestAssets, model_id: str, num_imgs: int, ): diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py index ed0d04c5c5f5d..b53351544c458 100644 --- a/tests/models/multimodal/processing/test_phi3v.py +++ b/tests/models/multimodal/processing/test_phi3v.py @@ -4,7 +4,7 @@ import pytest from vllm.multimodal import MULTIMODAL_REGISTRY -from ....conftest import _ImageAssets +from ....conftest import ImageTestAssets from ...utils import build_model_context @@ -22,7 +22,7 @@ from ...utils import build_model_context @pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("kwargs_on_init", [True, False]) def test_processor_override( - image_assets: _ImageAssets, + image_assets: ImageTestAssets, model_id: str, mm_processor_kwargs: dict[str, int], expected_toks_per_img: int, diff --git a/tests/models/multimodal/processing/test_phi4mm.py b/tests/models/multimodal/processing/test_phi4mm.py index 797986adba4af..c6e272650e08b 100644 --- a/tests/models/multimodal/processing/test_phi4mm.py +++ b/tests/models/multimodal/processing/test_phi4mm.py @@ -4,7 +4,7 @@ import pytest from vllm.multimodal import MULTIMODAL_REGISTRY -from ....conftest import _ImageAssets +from ....conftest import ImageTestAssets from ...utils import build_model_context @@ -22,7 +22,7 @@ from ...utils import build_model_context @pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("kwargs_on_init", [True, False]) def test_processor_override( - image_assets: _ImageAssets, + image_assets: ImageTestAssets, model_id: str, mm_processor_kwargs: dict[str, int], expected_toks_per_img: int, diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py index d8c2ca414d41c..02abe1ca8b024 100644 --- a/tests/models/multimodal/processing/test_qwen2_vl.py +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -4,7 +4,7 @@ import pytest from vllm.multimodal import MULTIMODAL_REGISTRY -from ....conftest import _ImageAssets +from ....conftest import ImageTestAssets from ...utils import build_model_context @@ -19,7 +19,7 @@ from ...utils import build_model_context @pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("kwargs_on_init", [True, False]) def test_processor_override( - image_assets: _ImageAssets, + image_assets: ImageTestAssets, model_id: str, mm_processor_kwargs: dict[str, object], expected_toks_per_img: int, diff --git a/tests/models/multimodal/processing/test_smolvlm.py b/tests/models/multimodal/processing/test_smolvlm.py index 56edc58a71baa..224d1bcedb966 100644 --- a/tests/models/multimodal/processing/test_smolvlm.py +++ b/tests/models/multimodal/processing/test_smolvlm.py @@ -5,7 +5,7 @@ from transformers import SmolVLMConfig from vllm.multimodal import MULTIMODAL_REGISTRY -from ....conftest import _ImageAssets +from ....conftest import ImageTestAssets from ...utils import build_model_context @@ -21,7 +21,7 @@ from ...utils import build_model_context @pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("kwargs_on_init", [True, False]) def test_processor_override( - image_assets: _ImageAssets, + image_assets: ImageTestAssets, model_id: str, mm_processor_kwargs: dict[str, object], expected_toks_per_img: int, diff --git a/tests/models/quantization/test_awq.py b/tests/models/quantization/test_awq.py index c02c3d90e345f..597c8e48fb64d 100644 --- a/tests/models/quantization/test_awq.py +++ b/tests/models/quantization/test_awq.py @@ -7,7 +7,7 @@ import torch from vllm.multimodal.image import rescale_image_size -from ...conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets +from ...conftest import IMAGE_ASSETS, ImageTestAssets, VllmRunner from ..utils import check_logprobs_close HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ @@ -20,7 +20,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ def run_awq_test( vllm_runner: type[VllmRunner], - image_assets: _ImageAssets, + image_assets: ImageTestAssets, source_model: str, quant_model: str, *, diff --git a/tests/models/registry.py b/tests/models/registry.py index a19c43b698f1a..cce2c82b3dc30 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -72,12 +72,15 @@ class _HfExamplesInfo: return current_version = TRANSFORMERS_VERSION + cur_base_version = Version(current_version).base_version min_version = self.min_transformers_version max_version = self.max_transformers_version msg = f"`transformers=={current_version}` installed, but `transformers" - if min_version and Version(current_version) < Version(min_version): + # Only check the base version for the min/max version, otherwise preview + # models cannot be run because `x.yy.0.dev0`<`x.yy.0` + if min_version and Version(cur_base_version) < Version(min_version): msg += f">={min_version}` is required to run this model." - elif max_version and Version(current_version) > Version(max_version): + elif max_version and Version(cur_base_version) > Version(max_version): msg += f"<={max_version}` is required to run this model." else: return @@ -362,8 +365,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct"), # noqa: E501 - "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B", # noqa: E501 - min_transformers_version="4.52"), # noqa: E501 + "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B", + min_transformers_version="4.52"), "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"), "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"), # noqa: E501 "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501 diff --git a/tests/multimodal/assets/image1.png b/tests/multimodal/assets/image1.png new file mode 100644 index 0000000000000..17c7d4cdffe91 Binary files /dev/null and b/tests/multimodal/assets/image1.png differ diff --git a/tests/multimodal/assets/image2.png b/tests/multimodal/assets/image2.png new file mode 100644 index 0000000000000..0f13ce5d983d1 Binary files /dev/null and b/tests/multimodal/assets/image2.png differ diff --git a/tests/multimodal/test_hasher.py b/tests/multimodal/test_hasher.py new file mode 100644 index 0000000000000..17b36b36888d5 --- /dev/null +++ b/tests/multimodal/test_hasher.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 +from pathlib import Path + +import numpy as np +import pytest +import torch +from PIL import Image, ImageDraw + +from vllm.multimodal.hasher import MultiModalHasher + +ASSETS_DIR = Path(__file__).parent / "assets" +assert ASSETS_DIR.exists() + + +# NOTE: Images that are the same visually are allowed to have the same hash +@pytest.mark.parametrize("mode_pair", [("1", "L"), ("RGBA", "CMYK")]) +def test_hash_collision_image_mode(mode_pair): + mode1, mode2 = mode_pair + image1 = Image.new(mode1, size=(10, 10), color=1) + image2 = Image.new(mode2, size=(10, 10), color=1) + + hasher = MultiModalHasher + assert hasher.hash_kwargs(image=image1) != hasher.hash_kwargs(image=image2) + + +def test_hash_collision_image_palette(): + # These images differ only in Image.palette._palette + image1 = Image.open(ASSETS_DIR / "image1.png") + image2 = Image.open(ASSETS_DIR / "image2.png") + + hasher = MultiModalHasher + assert hasher.hash_kwargs(image=image1) != hasher.hash_kwargs(image=image2) + + +def test_hash_collision_image_transpose(): + image1 = Image.new("1", size=(10, 20)) + ImageDraw.Draw(image1).line([(0, 0), (10, 0)]) + + image2 = Image.new("1", size=(20, 10)) + ImageDraw.Draw(image2).line([(0, 0), (0, 10)]) + + hasher = MultiModalHasher + assert hasher.hash_kwargs(image=image1) != hasher.hash_kwargs(image=image2) + + +def test_hash_collision_tensor_shape(): + # The hash should be different though the data is the same when flattened + arr1 = torch.zeros((5, 10, 20, 3)) + arr2 = torch.zeros((10, 20, 5, 3)) + + hasher = MultiModalHasher + assert hasher.hash_kwargs(data=arr1) != hasher.hash_kwargs(data=arr2) + + +def test_hash_collision_array_shape(): + # The hash should be different though the data is the same when flattened + arr1 = np.zeros((5, 10, 20, 3)) + arr2 = np.zeros((10, 20, 5, 3)) + + hasher = MultiModalHasher + assert hasher.hash_kwargs(data=arr1) != hasher.hash_kwargs(data=arr2) diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py index 314ec90e34f93..1a20228765e81 100644 --- a/tests/quantization/test_torchao.py +++ b/tests/quantization/test_torchao.py @@ -3,6 +3,7 @@ import importlib.metadata import importlib.util import pytest +import torch DTYPE = ["bfloat16"] @@ -21,5 +22,30 @@ def test_pre_quantized_model(vllm_runner): print(output) +@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available") +@pytest.mark.parametrize( + "pt_load_map_location", + [ + "cuda:0", + # {"": "cuda"}, + ]) +def test_opt_125m_int4wo_model_loading_with_params(vllm_runner, + pt_load_map_location): + """ + Test loading roberta-base model with no lm_head. + """ + torch._dynamo.reset() + model_name = "jerryzh168/opt-125m-int4wo" + with vllm_runner(model_name=model_name, + quantization="torchao", + dtype="bfloat16", + pt_load_map_location=pt_load_map_location) as llm: + output = llm.generate_greedy(["The capital of France is"], + max_tokens=32) + + assert output + print(output) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/test_config.py b/tests/test_config.py index f2155d954db06..7db95e3f64502 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -5,7 +5,8 @@ from typing import Literal, Union import pytest -from vllm.config import ModelConfig, PoolerConfig, config, get_field +from vllm.config import (LoadConfig, ModelConfig, PoolerConfig, VllmConfig, + config, get_field) from vllm.model_executor.layers.pooler import PoolingType from vllm.platforms import current_platform @@ -410,3 +411,16 @@ def test_generation_config_loading(): override_generation_config=override_generation_config) assert model_config.get_diff_sampling_param() == override_generation_config + + +@pytest.mark.parametrize("pt_load_map_location", [ + "cuda", + { + "": "cuda" + }, +]) +def test_load_config_pt_load_map_location(pt_load_map_location): + load_config = LoadConfig(pt_load_map_location=pt_load_map_location) + config = VllmConfig(load_config=load_config) + + assert config.load_config.pt_load_map_location == pt_load_map_location diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index ee4e95856f233..9987688b02fad 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1165,3 +1165,80 @@ def test_kv_connector_handles_preemption(): # All memory should be freed since nothing is running. assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \ == NUM_BLOCKS - 1 + + +def make_output(scheduler: Scheduler): + return ModelRunnerOutput( + req_ids=[req.request_id for req in scheduler.running], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(scheduler.running) + }, + sampled_token_ids=[[1000]] * len(scheduler.running), + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + + +def assert_scheduler_empty(scheduler: Scheduler): + """Confirm the scheduler is "empty" - i.e. no leaks.""" + # Scheduler Metadata. + assert len(scheduler.requests) == 0 + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == 0 + assert len(scheduler.finished_req_ids) == 0 + assert len(scheduler._cached_reqs_data) == 0 + + # EncoderCacheManager. + assert len(scheduler.encoder_cache_manager.freed) == 0 + assert len(scheduler.encoder_cache_manager.cached) == 0 + + # KVCache Manager. + assert len(scheduler.kv_cache_manager.req_to_blocks) == 0 + assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0 + assert len(scheduler.kv_cache_manager.num_cached_block) == 0 + num_free_blocks = ( + scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks) + assert num_free_blocks == ( + scheduler.kv_cache_manager.block_pool.num_gpu_blocks - 1) + + # NOTE(rob): just the ref count on blocks will be 0. The hash + # value, etc will remain since we lazily evict for prefix cache. + for block in scheduler.kv_cache_manager.block_pool.blocks: + assert block.ref_cnt == 0 + # assert block._block_hash is None + # assert ( + # len(scheduler.kv_cache_manager.block_pool.cached_block_hash_to_block + # ) == 0) + + +def test_memory_leak(): + """Test that we do not have a memory leak.""" + + scheduler = create_scheduler(enable_prefix_caching=True) + + NUM_REQUESTS = 5 + NUM_TOKENS = 10 + MAX_TOKENS = 10 + requests = create_requests(num_requests=NUM_REQUESTS, + num_tokens=NUM_TOKENS, + max_tokens=MAX_TOKENS) + + # Add each request. + for request in requests: + scheduler.add_request(request) + scheduler_output = scheduler.schedule() + model_runner_output = make_output(scheduler) + scheduler.update_from_output(scheduler_output, model_runner_output) + + # Iterate until done. + while True: + scheduler_output = scheduler.schedule() + if len(scheduler.running) == 0: + break + model_runner_output = make_output(scheduler) + scheduler.update_from_output(scheduler_output, model_runner_output) + + # Confirm no memory leak. + assert_scheduler_empty(scheduler) diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index b8ba69b0dd8f3..ae4b536524be0 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -31,23 +31,38 @@ def test_deepseek_mla_attn_backend_module(): assert model_runner.attn_backend.__name__ == "TritonMLABackend" -@pytest.mark.parametrize("batch_size", list(range(1, 257))) -def test_prepare_prompt(batch_size): +@pytest.mark.parametrize("batch_size", list(range(1, 257, 3))) +@pytest.mark.parametrize("use_prompt_embeds", [True, False]) +def test_prepare_prompt(batch_size, use_prompt_embeds, monkeypatch): + if use_prompt_embeds: + # Prompt Embeddings is only currently supported on V0 + monkeypatch.setenv("VLLM_USE_V1", "0") + model_runner = _create_model_runner( "facebook/opt-125m", max_num_batched_tokens=100000, max_num_seqs=100000, enable_chunked_prefill=False, + enable_prompt_embeds=True, ) seq_lens: list[int] = [] seq_group_metadata_list: list[SequenceGroupMetadata] = [] block_tables = {0: [1]} + expected_input_embeds_len = 0 for i in range(batch_size): # make sure all tokens fit into one block seq_len = i % (model_runner.block_size - 1) + 1 seq_lens.append(seq_len) - seq_data = SequenceData.from_seqs(range(seq_len)) + if use_prompt_embeds: + seq_data = SequenceData.from_seqs( + prompt_token_ids=[0] * seq_len, + prompt_embeds=torch.rand(seq_len, 10), + ) + expected_input_embeds_len += seq_len + else: + seq_data = SequenceData.from_seqs(prompt_token_ids=range(seq_len)) + seq_group_metadata = SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, @@ -68,6 +83,7 @@ def test_prepare_prompt(batch_size): seq_group_metadata_list) input_tokens = model_input.input_tokens input_positions = model_input.input_positions + input_embeds = model_input.inputs_embeds attn_metadata = model_input.attn_metadata return_seq_lens = model_input.seq_lens slot_mapping = attn_metadata.slot_mapping @@ -121,7 +137,11 @@ def test_prepare_prompt(batch_size): assert len(input_tokens) == sum(seq_lens) assert len(input_positions) == sum(seq_lens) - torch.testing.assert_close(input_tokens, input_positions) + if expected_input_embeds_len == 0: + torch.testing.assert_close(input_tokens, input_positions) + assert input_embeds is None + else: + assert len(input_embeds) == expected_input_embeds_len sampling_metadata = SamplingMetadata.prepare( seq_group_metadata_list, @@ -145,8 +165,13 @@ def test_prepare_prompt(batch_size): torch.testing.assert_close(actual, expected) -@pytest.mark.parametrize("batch_size", list(range(1, 257))) -def test_prepare_decode_cuda_graph(batch_size): +@pytest.mark.parametrize("batch_size", list(range(1, 257, 3))) +@pytest.mark.parametrize("use_prompt_embeds", [True, False]) +def test_prepare_decode_cuda_graph(batch_size, use_prompt_embeds, monkeypatch): + if use_prompt_embeds: + # Prompt Embeddings is only currently supported on V0 + monkeypatch.setenv("VLLM_USE_V1", "0") + model_runner = _create_model_runner( "facebook/opt-125m", seed=0, @@ -155,6 +180,7 @@ def test_prepare_decode_cuda_graph(batch_size): max_num_batched_tokens=100000, max_num_seqs=100000, enable_chunked_prefill=False, + enable_prompt_embeds=True, ) context_lens: list[int] = [] @@ -164,10 +190,19 @@ def test_prepare_decode_cuda_graph(batch_size): # make sure all tokens fit into one block context_len = i % (model_runner.block_size - 1) + 1 context_lens.append(context_len) - seq_data = SequenceData.from_seqs(range(context_len)) + if use_prompt_embeds: + seq_data = SequenceData.from_seqs( + prompt_token_ids=[0] * context_len, + prompt_embeds=torch.rand(context_len, 10), + ) + output_embed = torch.rand(10) + else: + seq_data = SequenceData.from_seqs( + prompt_token_ids=range(context_len)) + output_embed = None seq_data.update_num_computed_tokens(context_len) # Append one token ID since prefill is finished. - seq_data.append_token_id(1, 0) + seq_data.append_token_id(1, 0, output_embed) seq_group_metadata = SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=False, @@ -180,9 +215,12 @@ def test_prepare_decode_cuda_graph(batch_size): model_input = model_runner._prepare_model_input_tensors( seq_group_metadata_list) - input_tokens, input_positions, attn_metadata, slot_mapping = ( - model_input.input_tokens, model_input.input_positions, - model_input.attn_metadata, model_input.attn_metadata.slot_mapping) + input_tokens = model_input.input_tokens + input_positions = model_input.input_positions + input_embeds = model_input.inputs_embeds + attn_metadata = model_input.attn_metadata + slot_mapping = attn_metadata.slot_mapping + assert len(slot_mapping) == len(input_tokens) expected_bs = model_runner.vllm_config.pad_for_cudagraph( @@ -227,7 +265,7 @@ def test_prepare_decode_cuda_graph(batch_size): # block table's first index corresponds to each batch, meaning in # decoding it is each token. assert attn_metadata.block_tables.shape[0] == len(input_tokens) - # Block table's second dim correspondsd to each token's block number. + # Block table's second dim corresponds to each token's block number. # It is padded up to assert attn_metadata.block_tables.shape[1] == ( model_runner.get_max_block_per_batch()) @@ -235,7 +273,12 @@ def test_prepare_decode_cuda_graph(batch_size): assert len(input_tokens) == expected_bs assert len(input_positions) == expected_bs - torch.allclose(input_tokens, input_positions) + if use_prompt_embeds: + expected_input_embeds_length = start_loc[-1] + assert len(input_embeds) == expected_input_embeds_length + assert expected_input_embeds_length <= expected_bs + else: + assert input_embeds is None # Verify Sampling expected_selected_token_indices = [] @@ -266,25 +309,27 @@ def test_empty_seq_group(): seq_group_metadata_list: list[SequenceGroupMetadata] = [] model_input = model_runner._prepare_model_input_tensors( seq_group_metadata_list) - input_tokens, input_positions, attn_metadata = ( - model_input.input_tokens, - model_input.input_positions, - model_input.attn_metadata, - ) + + input_tokens = model_input.input_tokens + input_positions = model_input.input_positions + attn_metadata = model_input.attn_metadata + assert input_tokens is None assert input_positions is None assert attn_metadata is None model_input = model_runner._prepare_model_input_tensors( seq_group_metadata_list) - (input_tokens, input_positions, attn_metadata, return_seq_lens) = ( - model_input.input_tokens, - model_input.input_positions, - model_input.attn_metadata, - model_input.seq_lens, - ) + + input_tokens = model_input.input_tokens + input_positions = model_input.input_positions + input_embeds = model_input.inputs_embeds + attn_metadata = model_input.attn_metadata + return_seq_lens = model_input.seq_lens + assert input_tokens is None assert input_positions is None + assert input_embeds is None assert attn_metadata is None assert return_seq_lens is None @@ -299,9 +344,15 @@ def distributed_init(): ensure_model_parallel_initialized(1, 1) -@pytest.mark.parametrize("batch_size", list(range(2, 128))) +@pytest.mark.parametrize("batch_size", list(range(2, 128, 3))) @pytest.mark.parametrize("enforce_eager", [True, False]) -def test_hybrid_batches(batch_size, enforce_eager, distributed_init): +@pytest.mark.parametrize('use_prompt_embeds', [True, False]) +def test_hybrid_batches(batch_size, enforce_eager, use_prompt_embeds, + distributed_init, monkeypatch): + if use_prompt_embeds: + # Prompt Embeddings is only currently supported on V0 + monkeypatch.setenv("VLLM_USE_V1", "0") + model_runner = _create_model_runner( "facebook/opt-125m", seed=0, @@ -310,6 +361,7 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): max_num_batched_tokens=100000, max_num_seqs=100000, enable_chunked_prefill=True, + enable_prompt_embeds=True, ) # Add prefill requests. @@ -320,11 +372,20 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): block_tables = {0: [1]} prefill_batch_size = batch_size // 2 decode_batch_size = batch_size - prefill_batch_size + expected_input_embeds_len = 0 for i in range(prefill_batch_size): # make sure all tokens fit into one block seq_len = i % (model_runner.block_size - 1) + 1 seq_lens.append(seq_len) - seq_data = SequenceData.from_seqs(range(seq_len)) + if use_prompt_embeds: + seq_data = SequenceData.from_seqs( + prompt_token_ids=[0] * seq_len, + prompt_embeds=torch.rand(seq_len, 10), + ) + expected_input_embeds_len += seq_len + else: + seq_data = SequenceData.from_seqs( + prompt_token_ids=range(seq_len), ) seq_group_metadata = SequenceGroupMetadata( request_id=f"test_{i}", is_prompt=True, @@ -340,8 +401,21 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): for i in range(prefill_batch_size, batch_size): # make sure all tokens fit into one block context_len = i % (model_runner.block_size - 1) + 1 - seq_data = SequenceData.from_seqs(range(context_len)) - seq_data.append_token_id(1, 0) + if use_prompt_embeds: + seq_data = SequenceData.from_seqs( + prompt_token_ids=[0] * context_len, + prompt_embeds=torch.rand(context_len, 10), + ) + output_embed = torch.rand(10) + # This also iterates the expected input_embeds, because the model + # needs both the input and output embeddings passed into together + expected_input_embeds_len += 1 + else: + seq_data = SequenceData.from_seqs( + prompt_token_ids=range(context_len), ) + output_embed = None + assert len(seq_data.prompt_token_ids) == context_len + seq_data.append_token_id(1, 0, output_embed) seq_data.update_num_computed_tokens(context_len) seq_group_metadata = SequenceGroupMetadata( request_id=f"test_{i}", @@ -355,11 +429,11 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): decode_metadata_list.append(seq_group_metadata) model_input = model_runner.prepare_model_input(seq_group_metadata_list) - (input_tokens, input_positions, attn_metadata) = ( - model_input.input_tokens, - model_input.input_positions, - model_input.attn_metadata, - ) + + input_tokens = model_input.input_tokens + input_positions = model_input.input_positions + input_embeds = model_input.inputs_embeds + attn_metadata = model_input.attn_metadata prefill_meta_actual = attn_metadata.prefill_metadata decode_meta_actual = attn_metadata.decode_metadata @@ -369,6 +443,10 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init): assert attn_metadata.num_prefills == prefill_batch_size assert attn_metadata.num_decode_tokens == decode_batch_size assert attn_metadata.num_prefill_tokens == sum(seq_lens) + if expected_input_embeds_len == 0: + assert input_embeds is None + else: + assert len(input_embeds) == expected_input_embeds_len # Verify attn metadata is consistent. We don't need to test individual # values here because they are tested above. diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 7bb01507ac2c8..64f4310151cd8 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -559,7 +559,6 @@ def cutlass_scaled_mm(a: torch.Tensor, scale_a.shape * [1, 128] == a.shape scale_b.shape * [128, 128] == b.shape """ - assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0) assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16) assert bias is None or bias.shape[0] == b.shape[ 1] and bias.dtype == out_dtype @@ -567,7 +566,8 @@ def cutlass_scaled_mm(a: torch.Tensor, m = a.shape[0] n = b.shape[1] - if current_platform.is_rocm(): + cutlass_compatible_b = (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0) + if current_platform.is_rocm() or not cutlass_compatible_b: triton_scaled_mm_module = importlib.import_module( "vllm.model_executor.layers.quantization.compressed_tensors." "triton_scaled_mm") diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py index 0203dc092a71a..a21eb7f599faa 100644 --- a/vllm/assets/audio.py +++ b/vllm/assets/audio.py @@ -18,19 +18,25 @@ except ImportError: ASSET_DIR = "multimodal_asset" +AudioAssetName = Literal["winning_call", "mary_had_lamb"] + @dataclass(frozen=True) class AudioAsset: - name: Literal["winning_call", "mary_had_lamb"] + name: AudioAssetName + + @property + def filename(self) -> str: + return f"{self.name}.ogg" @property def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]: - audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg", + audio_path = get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR) return librosa.load(audio_path, sr=None) def get_local_path(self) -> Path: - return get_vllm_public_assets(filename=f"{self.name}.ogg", + return get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR) @property diff --git a/vllm/assets/image.py b/vllm/assets/image.py index 2b1d258da9c78..d8cca9b74edd5 100644 --- a/vllm/assets/image.py +++ b/vllm/assets/image.py @@ -10,10 +10,12 @@ from .base import get_vllm_public_assets VLM_IMAGES_DIR = "vision_model_images" +ImageAssetName = Literal["stop_sign", "cherry_blossom"] + @dataclass(frozen=True) class ImageAsset: - name: Literal["stop_sign", "cherry_blossom"] + name: ImageAssetName @property def pil_image(self) -> Image.Image: diff --git a/vllm/assets/video.py b/vllm/assets/video.py index 133e18b68e25b..bf06746a9ff66 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from functools import lru_cache -from typing import Literal, Optional +from typing import ClassVar, Literal, Optional import cv2 import numpy as np @@ -76,20 +76,31 @@ def video_to_pil_images_list(path: str, ] +VideoAssetName = Literal["baby_reading"] + + @dataclass(frozen=True) class VideoAsset: - name: Literal["sample_demo_1.mp4"] + name: VideoAssetName num_frames: int = -1 + _NAME_TO_FILE: ClassVar[dict[VideoAssetName, str]] = { + "baby_reading": "sample_demo_1.mp4", + } + + @property + def filename(self) -> str: + return self._NAME_TO_FILE[self.name] + @property def pil_images(self) -> list[Image.Image]: - video_path = download_video_asset(self.name) + video_path = download_video_asset(self.filename) ret = video_to_pil_images_list(video_path, self.num_frames) return ret @property def np_ndarrays(self) -> npt.NDArray: - video_path = download_video_asset(self.name) + video_path = download_video_asset(self.filename) ret = video_to_ndarrays(video_path, self.num_frames) return ret @@ -99,5 +110,5 @@ class VideoAsset: See also: examples/offline_inference/qwen2_5_omni/only_thinker.py """ - video_path = download_video_asset(self.name) + video_path = download_video_asset(self.filename) return librosa.load(video_path, sr=sampling_rate)[0] diff --git a/vllm/attention/backends/cpu_mla.py b/vllm/attention/backends/cpu_mla.py index 528df2e98679c..4567893a9ef7c 100644 --- a/vllm/attention/backends/cpu_mla.py +++ b/vllm/attention/backends/cpu_mla.py @@ -281,8 +281,7 @@ class CPUMLAImpl(MLACommonImpl[CPUMLAMetadata]): # remove padding output = output.view(-1, self.num_heads, q.shape[-1])[..., :v.shape[-1]] - output = output.reshape(-1, self.num_heads * v.shape[-1]) - return self.o_proj(output)[0] + return output.reshape(-1, self.num_heads * v.shape[-1]) def _forward_decode( self, @@ -303,4 +302,4 @@ class CPUMLAImpl(MLACommonImpl[CPUMLAMetadata]): ops.mla_decode_kvcache_cpu(o, q, kv_c_and_k_pe_cache, self.scale, decode_meta.block_tables, decode_meta.seq_lens_tensor) - return self._v_up_proj_and_o_proj(o) + return self._v_up_proj(o) diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index d92177d58a48f..37b20d0739f70 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -367,9 +367,17 @@ class FlashInferState(AttentionState): # scheduled while CUDA graph mode is enabled. We don't run graph in that # case. if use_cuda_graph and is_decode: - batch_size = model_input.input_tokens.shape[0] - state = (self.runner.graph_runners[model_input.virtual_engine] - [batch_size].attn_state) + if model_input.inputs_embeds is None: + batch_size = model_input.input_tokens.shape[0] + state = ( + self.runner.graph_runners[model_input.virtual_engine][( + batch_size, False)].attn_state) + else: + batch_size = model_input.inputs_embeds.shape[0] + state = ( + self.runner.graph_runners[model_input.virtual_engine][( + batch_size, True)].attn_state) + model_input.attn_metadata.prefill_wrapper = state._get_prefill_wrapper( ) model_input.attn_metadata.decode_wrapper = state._get_decode_wrapper() diff --git a/vllm/attention/backends/flashmla.py b/vllm/attention/backends/flashmla.py index 5d0c230933105..0e62748ddbee4 100644 --- a/vllm/attention/backends/flashmla.py +++ b/vllm/attention/backends/flashmla.py @@ -239,4 +239,4 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): causal=True, ) - return self._v_up_proj_and_o_proj(o) + return self._v_up_proj(o) diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index 382a9a6d44d84..12d85b74244f4 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -207,7 +207,7 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping, from vllm.attention.ops.merge_attn_states import merge_attn_states from vllm.attention.utils.fa_utils import get_flash_attn_version from vllm.model_executor.layers.linear import (ColumnParallelLinear, - LinearBase, RowParallelLinear, + LinearBase, UnquantizedLinearMethod) from vllm.model_executor.layers.rotary_embedding import ( DeepseekScalingRotaryEmbedding, RotaryEmbedding) @@ -1032,12 +1032,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): qk_head_dim: int, v_head_dim: int, rotary_emb: RotaryEmbedding, - # q_proj should be q_b_proj if q_lora_rank is not None, but from an - # attention backend perspective we rely on the layer to pass in the - # correct matrix - q_proj: ColumnParallelLinear, kv_b_proj: ColumnParallelLinear, - o_proj: RowParallelLinear, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -1055,9 +1050,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): self.rotary_emb = rotary_emb self.use_yarn_rope = isinstance(rotary_emb, DeepseekScalingRotaryEmbedding) - self.q_proj = q_proj self.kv_b_proj = kv_b_proj - self.o_proj = o_proj self.triton_fa_func = triton_attention # Handle the differences between the flash_attn_varlen from flash_attn @@ -1141,27 +1134,13 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): return attn_out, rest[0] return attn_out - def _v_up_proj_and_o_proj(self, x): + def _v_up_proj(self, x): # Convert from (B, N, L) to (N, B, L) x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) # Multiply (N, B, L) x (N, L, V) -> (N, B, V) x = torch.bmm(x, self.W_UV) # Convert from (N, B, V) to (B, N * V) - x = x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim) - return self.o_proj(x)[0] - - # Return `ql_nope`, `q_pe` - def _q_proj_and_k_up_proj(self, x): - q_nope, q_pe = self.q_proj(x)[0]\ - .view(-1, self.num_heads, self.qk_head_dim)\ - .split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) - - # Convert from (B, N, P) to (N, B, P) - q_nope = q_nope.transpose(0, 1) - # Multiply (N, B, P) x (N, P, L) -> (N, B, L) - ql_nope = torch.bmm(q_nope, self.W_UK_T) - # Convert from (N, B, L) to (B, N, L) - return ql_nope.transpose(0, 1), q_pe + return x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim) def process_weights_after_loading(self, act_dtype: torch.dtype): @@ -1345,7 +1324,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): suffix_lse=suffix_lse, ) - return self.o_proj(output.flatten(start_dim=-2))[0] + return output.flatten(start_dim=-2) @abstractmethod def _forward_decode( @@ -1360,7 +1339,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): def forward( self, layer: AttentionLayer, - hidden_states_or_q_c: torch.Tensor, # query in unified attn + q: torch.Tensor, # query in unified attn k_c_normed: torch.Tensor, # key in unified attn k_pe: torch.Tensor, # value in unified attn kv_cache: torch.Tensor, @@ -1391,27 +1370,32 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): assert hasattr(attn_metadata, "input_positions") num_prefill_tokens: int = attn_metadata.num_prefill_tokens + q = q.view(-1, self.num_heads, self.qk_head_dim) - decode_hs_or_q_c = hidden_states_or_q_c[num_prefill_tokens:] + decode_q = q[num_prefill_tokens:] decode_k_pe = k_pe[num_prefill_tokens:] decode_input_positions = \ attn_metadata.input_positions[num_prefill_tokens:] - prefill_hs_or_q_c = hidden_states_or_q_c[:num_prefill_tokens] + prefill_q = q[:num_prefill_tokens] prefill_k_pe = k_pe[:num_prefill_tokens] prefill_input_positions = \ attn_metadata.input_positions[:num_prefill_tokens] prefill_k_c_normed = k_c_normed[:num_prefill_tokens] if has_decode: - decode_ql_nope, decode_q_pe = \ - self._q_proj_and_k_up_proj(decode_hs_or_q_c) + decode_q_nope, decode_q_pe = decode_q.split( + [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + # Convert from (B, N, P) to (N, B, P) + decode_q_nope = decode_q_nope.transpose(0, 1) + # Multiply (N, B, P) x (N, P, L) -> (N, B, L) + decode_ql_nope = torch.bmm(decode_q_nope, self.W_UK_T) + # Convert from (N, B, L) to (B, N, L) + decode_ql_nope = decode_ql_nope.transpose(0, 1) decode_q_pe[...], decode_k_pe[...] = self.rotary_emb( decode_input_positions, decode_q_pe, decode_k_pe) if has_prefill: - prefill_q = self.q_proj(prefill_hs_or_q_c)[0]\ - .view(-1, self.num_heads, self.qk_head_dim) prefill_q_pe = prefill_q[..., self.qk_nope_head_dim:] prefill_q_pe[...], prefill_k_pe[...] = self.rotary_emb( prefill_input_positions, prefill_q_pe, prefill_k_pe) @@ -1429,9 +1413,9 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): output = torch.empty(attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens, - self.o_proj.output_size, - device=hidden_states_or_q_c.device, - dtype=hidden_states_or_q_c.dtype) + self.v_head_dim * self.num_heads, + device=q.device, + dtype=q.dtype) if has_prefill: output[:num_prefill_tokens] = self._forward_prefill( prefill_q, prefill_k_c_normed, prefill_k_pe, kv_cache, diff --git a/vllm/attention/backends/rocm_aiter_mla.py b/vllm/attention/backends/rocm_aiter_mla.py index 6e695b78e0e15..2984bc1dad64a 100644 --- a/vllm/attention/backends/rocm_aiter_mla.py +++ b/vllm/attention/backends/rocm_aiter_mla.py @@ -409,4 +409,4 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]): attn_metadata.paged_kv_indices, attn_metadata.paged_kv_last_page_lens) - return self._v_up_proj_and_o_proj(o) + return self._v_up_proj(o) diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py index 61e5c76d9fda3..6945c2c6e29cd 100644 --- a/vllm/attention/backends/triton_mla.py +++ b/vllm/attention/backends/triton_mla.py @@ -110,4 +110,4 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]): decode_meta.seq_lens_tensor, attn_logits, num_kv_splits, self.scale, PAGE_SIZE) - return self._v_up_proj_and_o_proj(o) + return self._v_up_proj(o) diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py index 1b47581641b06..759b3d8536dde 100644 --- a/vllm/attention/ops/chunked_prefill_paged_decode.py +++ b/vllm/attention/ops/chunked_prefill_paged_decode.py @@ -289,7 +289,7 @@ def chunked_prefill_paged_decode( max_num_partitions = ((max_seq_len + _PARTITION_SIZE_ROCM - 1) // _PARTITION_SIZE_ROCM) assert _PARTITION_SIZE_ROCM % block_size == 0 - total_num_seq = query.shape[0] + total_num_seq = block_table.shape[0] tmp_output = torch.empty( size=(total_num_seq, num_query_heads, max_num_partitions, head_size), diff --git a/vllm/config.py b/vllm/config.py index c2995cacaeb6c..91ef9dcdbd565 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -268,7 +268,7 @@ class ModelConfig: It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.""" rope_scaling: dict[str, Any] = field(default_factory=dict) - """RoPE scaling configuration in JSON format. For example, + """RoPE scaling configuration. For example, `{"rope_type":"dynamic","factor":2.0}`.""" rope_theta: Optional[float] = None """RoPE theta. Use with `rope_scaling`. In some cases, changing the RoPE @@ -321,6 +321,10 @@ class ModelConfig: """Skip initialization of tokenizer and detokenizer. Expects valid `prompt_token_ids` and `None` for prompt from the input. The generated output will contain token ids.""" + enable_prompt_embeds: bool = False + """If `True`, enables passing text embeddings as inputs via the + `prompt_embeds` key. Note that enabling this will double the time required + for graph compilation.""" served_model_name: Optional[Union[str, list[str]]] = None """The model name(s) used in the API. If multiple names are provided, the server will respond to any of the provided names. The model name in the @@ -346,14 +350,13 @@ class ModelConfig: (stored in `~/.huggingface`).""" hf_overrides: HfOverrides = field(default_factory=dict) """If a dictionary, contains arguments to be forwarded to the Hugging Face - config. If a callable, it is called to update the HuggingFace config. When - specified via CLI, the argument must be a valid JSON string.""" + config. If a callable, it is called to update the HuggingFace config.""" mm_processor_kwargs: Optional[dict[str, Any]] = None """Arguments to be forwarded to the model's processor for multi-modal data, e.g., image processor. Overrides for the multi-modal processor obtained from `AutoProcessor.from_pretrained`. The available overrides depend on the model that is being run. For example, for Phi-3-Vision: `{"num_crops": 4}`. - When specified via CLI, the argument must be a valid JSON string.""" + """ disable_mm_preprocessor_cache: bool = False """If `True`, disable caching of the multi-modal preprocessor/mapper (not recommended).""" @@ -361,15 +364,14 @@ class ModelConfig: """Initialize non-default neuron config or override default neuron config that are specific to Neuron devices, this argument will be used to configure the neuron config that can not be gathered from the vllm - arguments. e.g. `{"cast_logits_dtype": "bloat16"}`. When specified via CLI, - the argument must be a valid JSON string.""" + arguments. e.g. `{"cast_logits_dtype": "bloat16"}`.""" pooler_config: Optional["PoolerConfig"] = field(init=False) """Pooler config which controls the behaviour of output pooling in pooling models.""" override_pooler_config: Optional[Union[dict, "PoolerConfig"]] = None """Initialize non-default pooling config or override default pooling config for the pooling model. e.g. `{"pooling_type": "mean", "normalize": false}`. - When specified via CLI, the argument must be a valid JSON string.""" + """ logits_processor_pattern: Optional[str] = None """Optional regex pattern specifying valid logits processor qualified names that can be passed with the `logits_processors` extra completion argument. @@ -385,8 +387,7 @@ class ModelConfig: """Overrides or sets generation config. e.g. `{"temperature": 0.5}`. If used with `--generation-config auto`, the override parameters will be merged with the default config from the model. If used with - `--generation-config vllm`, only the override parameters are used. - When specified via CLI, the argument must be a valid JSON string.""" + `--generation-config vllm`, only the override parameters are used.""" enable_sleep_mode: bool = False """Enable sleep mode for the engine (only cuda platform is supported).""" model_impl: Union[str, ModelImpl] = ModelImpl.AUTO.value @@ -1556,14 +1557,23 @@ class LoadConfig: cache directory of Hugging Face.""" model_loader_extra_config: dict = field(default_factory=dict) """Extra config for model loader. This will be passed to the model loader - corresponding to the chosen load_format. This should be a JSON string that - will be parsed into a dictionary.""" + corresponding to the chosen load_format.""" ignore_patterns: Optional[Union[list[str], str]] = None """The list of patterns to ignore when loading the model. Default to "original/**/*" to avoid repeated loading of llama's checkpoints.""" use_tqdm_on_load: bool = True """Whether to enable tqdm for showing progress bar when loading model weights.""" + pt_load_map_location: Union[str, dict[str, str]] = "cpu" + """ + pt_load_map_location: the map location for loading pytorch checkpoint, to + support loading checkpoints can only be loaded on certain devices like + "cuda", this is equivalent to {"": "cuda"}. Another supported format is + mapping from different devices like from GPU 1 to GPU 0: + {"cuda:1": "cuda:0"}. Note that when passed from command line, the strings + in dictionary needs to be double quoted for json parsing. For more details, + see original doc for `map_location` in https://pytorch.org/docs/stable/generated/torch.load.html + """ def compute_hash(self) -> str: """ @@ -2816,7 +2826,6 @@ class MultiModalConfig: "limit_mm_per_prompt") """ The maximum number of input items allowed per prompt for each modality. - This should be a JSON string that will be parsed into a dictionary. Defaults to 1 (V0) or 999 (V1) for each modality. For example, to allow up to 16 images and 2 videos per prompt: @@ -3135,6 +3144,14 @@ def _get_and_verify_max_len( # derived length from the HF model config. if max_model_len is None: max_model_len = int(derived_max_model_len) + if current_platform.is_tpu(): + logger.warning( + "--max-model-len is not specified, " + "it's currently using model's default length %s, " + "which might be too large." + "Please input with --max-model-len based on your " + "request input length and output length, to avoid " + "unnecessary degradation.", max_model_len) elif max_model_len > derived_max_model_len: # Some models might have a separate key for specifying model_max_length # that will be bigger than derived_max_model_len. We compare user input diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 97d03d5e3b40a..06d4ed470b209 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1071,6 +1071,7 @@ class Scheduler: ) ignored_seq_groups: List[SequenceGroup] = [] seq_groups: List[ScheduledSequenceGroup] = [] + using_prompt_embeds: bool = False waiting_queue = self.waiting @@ -1138,6 +1139,15 @@ class Scheduler: waiting_queue.popleft() continue + # We cannot mix sequence groups that use prompt embeds and + # those that do not. + if len(seq_groups) == 0: + using_prompt_embeds = seq_group.uses_prompt_embeds() + if using_prompt_embeds != seq_group.uses_prompt_embeds(): + leftover_waiting_sequences.appendleft(seq_group) + waiting_queue.popleft() + continue + lora_int_id = 0 if self.lora_enabled: lora_int_id = seq_group.lora_int_id @@ -1295,17 +1305,39 @@ class Scheduler: # Merge lists num_prefill_groups = len(prefills.seq_groups) + ignored_seq_groups_for_embeds = list[SequenceGroup]() if num_prefill_groups > 0: scheduled_seq_groups = prefills.seq_groups scheduled_seq_groups.extend(running_scheduled.decode_seq_groups) + ignored_seq_groups_for_embeds.clear() else: scheduled_seq_groups = running_scheduled.decode_seq_groups + if len(scheduled_seq_groups) > 0: + using_prompt_embeds = scheduled_seq_groups[ + 0].seq_group.uses_prompt_embeds() + ignored_seq_groups_for_embeds.clear() + indices_ignored = list[int]() + for i, schedule_seq_group in enumerate(scheduled_seq_groups): + if using_prompt_embeds !=\ + schedule_seq_group.seq_group.uses_prompt_embeds(): + ignored_seq_groups_for_embeds.append( + schedule_seq_group.seq_group) + indices_ignored.append(i) + if len(ignored_seq_groups_for_embeds) > 0: + scheduled_seq_groups = [ + group for i, group in enumerate(scheduled_seq_groups) + if i not in indices_ignored + ] + else: + ignored_seq_groups_for_embeds.clear() + scheduled_seq_groups.extend(swapped_in.decode_seq_groups) blocks_to_copy = running_scheduled.blocks_to_copy blocks_to_copy.extend(swapped_in.blocks_to_copy) ignored_seq_groups = prefills.ignored_seq_groups + ignored_seq_groups.extend(ignored_seq_groups_for_embeds) ignored_seq_groups.extend(swapped_in.infeasible_seq_groups) return SchedulerOutputs( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 3cafcb7c31f21..f6f8fb69fb706 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -64,6 +64,13 @@ def optional_type( return _optional_type +def union_dict_and_str(val: str) -> Optional[Union[str, dict[str, str]]]: + if not re.match("^{.*}$", val): + return str(val) + else: + return optional_type(json.loads)(val) + + @deprecated( "Passing a JSON argument as a string containing comma separated key=value " "pairs is deprecated. This will be removed in v0.10.0. Please use a JSON " @@ -143,7 +150,7 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: # Get the help text for the field name = field.name - help = cls_docs[name] + help = cls_docs[name].strip() # Escape % for argparse help = help.replace("%", "%%") @@ -158,6 +165,7 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: type_hints.add(field.type) # Set other kwargs based on the type hints + json_tip = "\n\nShould be a valid JSON string." if contains_type(type_hints, bool): # Creates --no- and -- flags kwargs[name]["action"] = argparse.BooleanOptionalAction @@ -187,9 +195,14 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: kwargs[name]["type"] = human_readable_int elif contains_type(type_hints, float): kwargs[name]["type"] = float + elif contains_type(type_hints, + dict) and (contains_type(type_hints, str) or any( + is_not_builtin(th) for th in type_hints)): + kwargs[name]["type"] = union_dict_and_str elif contains_type(type_hints, dict): # Dict arguments will always be optional kwargs[name]["type"] = optional_type(json.loads) + kwargs[name]["help"] += json_tip elif (contains_type(type_hints, str) or any(is_not_builtin(th) for th in type_hints)): kwargs[name]["type"] = str @@ -221,6 +234,7 @@ class EngineArgs: hf_config_path: Optional[str] = ModelConfig.hf_config_path task: TaskOption = ModelConfig.task skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init + enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds tokenizer_mode: TokenizerMode = ModelConfig.tokenizer_mode trust_remote_code: bool = ModelConfig.trust_remote_code allowed_local_media_path: str = ModelConfig.allowed_local_media_path @@ -371,6 +385,7 @@ class EngineArgs: reasoning_parser: str = DecodingConfig.reasoning_backend use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load + pt_load_map_location: str = LoadConfig.pt_load_map_location def __post_init__(self): # support `EngineArgs(compilation_config={...})` @@ -431,6 +446,8 @@ class EngineArgs: **model_kwargs["disable_cascade_attn"]) model_group.add_argument("--skip-tokenizer-init", **model_kwargs["skip_tokenizer_init"]) + model_group.add_argument("--enable-prompt-embeds", + **model_kwargs["enable_prompt_embeds"]) model_group.add_argument("--served-model-name", **model_kwargs["served_model_name"]) # This one is a special case because it is the @@ -491,6 +508,8 @@ class EngineArgs: type=str, default=None, help='Name or path of the QLoRA adapter.') + load_group.add_argument('--pt-load-map-location', + **load_kwargs["pt_load_map_location"]) # Guided decoding arguments guided_decoding_kwargs = get_kwargs(DecodingConfig) @@ -858,6 +877,7 @@ class EngineArgs: disable_sliding_window=self.disable_sliding_window, disable_cascade_attn=self.disable_cascade_attn, skip_tokenizer_init=self.skip_tokenizer_init, + enable_prompt_embeds=self.enable_prompt_embeds, served_model_name=self.served_model_name, limit_mm_per_prompt=self.limit_mm_per_prompt, use_async_output_proc=not self.disable_async_output_proc, @@ -883,12 +903,14 @@ class EngineArgs: if self.quantization == "bitsandbytes": self.load_format = "bitsandbytes" + return LoadConfig( load_format=self.load_format, download_dir=self.download_dir, model_loader_extra_config=self.model_loader_extra_config, ignore_patterns=self.ignore_patterns, use_tqdm_on_load=self.use_tqdm_on_load, + pt_load_map_location=self.pt_load_map_location, ) def create_speculative_config( @@ -1423,8 +1445,8 @@ class EngineArgs: # as the platform that vLLM is running on (e.g. the case of scaling # vLLM with Ray) and has no GPUs. In this case we use the default # values for non-H100/H200 GPUs. + from vllm.platforms import current_platform try: - from vllm.platforms import current_platform device_memory = current_platform.get_device_total_memory() except Exception: # This is only used to set default_max_num_batched_tokens @@ -1445,11 +1467,37 @@ class EngineArgs: } default_max_num_seqs = 256 + # tpu specific default values. + if current_platform.is_tpu(): + default_max_num_batched_tokens_tpu = { + UsageContext.LLM_CLASS: { + 'V6E': 2048, + 'V5E': 1024, + 'V5P': 512, + }, + UsageContext.OPENAI_API_SERVER: { + 'V6E': 1024, + 'V5E': 512, + 'V5P': 256, + } + } + use_context_value = usage_context.value if usage_context else None if (self.max_num_batched_tokens is None and usage_context in default_max_num_batched_tokens): - self.max_num_batched_tokens = default_max_num_batched_tokens[ - usage_context] + if current_platform.is_tpu(): + chip_name = current_platform.get_device_name() + if chip_name in default_max_num_batched_tokens_tpu[ + usage_context]: + self.max_num_batched_tokens = \ + default_max_num_batched_tokens_tpu[ + usage_context][chip_name] + else: + self.max_num_batched_tokens = \ + default_max_num_batched_tokens[usage_context] + else: + self.max_num_batched_tokens = default_max_num_batched_tokens[ + usage_context] logger.debug( "Setting max_num_batched_tokens to %d for %s usage context.", self.max_num_batched_tokens, use_context_value) @@ -1513,7 +1561,7 @@ def _warn_or_fallback(feature_name: str) -> bool: def human_readable_int(value): """Parse human-readable integers like '1k', '2M', etc. Including decimal values with decimal multipliers. - + Examples: - '1k' -> 1,000 - '1K' -> 1,024 diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 6cc9b881464e9..50da9679d5aae 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -489,9 +489,13 @@ class _AsyncLLMEngine(LLMEngine): if arrival_time is None: arrival_time = time.time() - if self.tokenizer is not None: - tokenizer = await self.get_tokenizer_async(lora_request) - self._validate_token_prompt(prompt, tokenizer=tokenizer) + if (isinstance(prompt, dict) + and prompt.get("prompt_embeds", None) is not None + and not prompt.get("prompt_token_ids", None)): + # We use the -2 dimension (instead of 0) in case a batched input + # of batch size 1 is passed in. + prompt["prompt_token_ids"] = [0 + ] * prompt["prompt_embeds"].shape[-2] processed_inputs = await self.input_preprocessor.preprocess_async( prompt, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 0930bae02e411..4398852daac98 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -30,7 +30,7 @@ from vllm.entrypoints.openai.logits_processors import ( get_logits_processors as get_openai_logits_processors) from vllm.executor.executor_base import ExecutorBase from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs -from vllm.inputs.parse import is_token_prompt, split_enc_dec_inputs +from vllm.inputs.parse import split_enc_dec_inputs from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.logits_process import get_bad_words_logits_processors @@ -753,10 +753,11 @@ class LLMEngine: if arrival_time is None: arrival_time = time.time() - if self.tokenizer is not None: - self._validate_token_prompt( - prompt, - tokenizer=self.get_tokenizer(lora_request=lora_request)) + if (isinstance(prompt, dict) + and prompt.get("prompt_embeds", None) is not None + and not prompt.get("prompt_token_ids", None)): + seq_len = prompt["prompt_embeds"].shape[0] + prompt["prompt_token_ids"] = [0] * seq_len processed_inputs = self.input_preprocessor.preprocess( prompt, @@ -776,27 +777,6 @@ class LLMEngine: priority=priority, ) - def _validate_token_prompt(self, prompt: PromptType, - tokenizer: AnyTokenizer): - # Guard against out-of-vocab tokens. - # For some tokenizers, tokenizer.decode will happily return empty text - # for token ids that are out of vocab, and we don't detect token ids - # that are greater than the max token id before running the model. - # However, these token ids will later crash a cuda kernel at runtime - # with an index out of bounds error. This will crash the entire engine. - # This needs to happen before multimodal input pre-processing, which - # may add dummy tokens that aren't part of the tokenizer's - # vocabulary. - if is_token_prompt(prompt): - prompt_ids = prompt["prompt_token_ids"] - if len(prompt_ids) == 0: - # Empty prompt check is handled later - return - max_input_id = max(prompt_ids) - if max_input_id > tokenizer.max_token_id: - raise ValueError( - "Token id {} is out of vocabulary".format(max_input_id)) - def _create_sequence_group_with_sampling( self, request_id: str, @@ -1267,11 +1247,13 @@ class LLMEngine: if self.scheduler_config.is_multi_step: is_prefill_append = seq.data.get_num_uncomputed_tokens( ) == 0 - seq.append_token_id(sample.output_token, sample.logprobs) + seq.append_token_id(sample.output_token, sample.logprobs, + sample.output_embed) if not is_prefill_append: seq_group.update_num_computed_tokens(1) else: - seq.append_token_id(sample.output_token, sample.logprobs) + seq.append_token_id(sample.output_token, sample.logprobs, + sample.output_embed) def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. @@ -2032,13 +2014,21 @@ class LLMEngine: tokenizer = (None if self.tokenizer is None else self.tokenizer.get_lora_tokenizer(lora_request)) - prompt_ids = prompt_inputs["prompt_token_ids"] + prompt_ids = prompt_inputs.get("prompt_token_ids", []) if not prompt_ids: if prompt_type == "encoder" and model_config.is_multimodal_model: pass # Mllama may have empty encoder inputs for text-only data + if prompt_inputs["type"] == "embeds": + pass else: raise ValueError(f"The {prompt_type} prompt cannot be empty") + if tokenizer is not None: + max_input_id = max(prompt_ids, default=0) + if max_input_id > tokenizer.max_token_id: + raise ValueError( + f"Token id {max_input_id} is out of vocabulary") + max_prompt_len = self.model_config.max_model_len if len(prompt_ids) > max_prompt_len: if prompt_type == "encoder" and model_config.is_multimodal_model: diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 126e7da702160..0f4c7517ebac8 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -167,6 +167,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor): sampling_params: SamplingParams) -> None: output_token_ids = [sample.output_token for sample in valid_samples] output_logprobs = [sample.logprobs for sample in valid_samples] + output_embeds = [sample.output_embed for sample in valid_samples] # Truncate to max_tokens if necessary. remaining_tokens = sampling_params.max_tokens - (seq.get_output_len() + @@ -190,11 +191,12 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor): is_prefill_sampled_token = seq.data.get_num_uncomputed_tokens() == 0 # Incrementally append tokens to the sequence, as if we had only one new # token. - for output_token_id, output_logprob in zip(output_token_ids, - output_logprobs): + for output_token_id, output_logprob, output_embed in zip( + output_token_ids, output_logprobs, output_embeds): seq.append_token_id( token_id=output_token_id, logprobs=output_logprob, + token_embed=output_embed, ) if is_prefill_sampled_token: diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index 4d96791a1f8a3..b5b51bb25a862 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -119,7 +119,8 @@ class SingleStepOutputProcessor(SequenceGroupOutputProcessor): sample = outputs.samples[0] seq = seq_group.first_seq if not is_async: - seq.append_token_id(sample.output_token, sample.logprobs) + seq.append_token_id(sample.output_token, sample.logprobs, + sample.output_embed) if sampling_params.detokenize and self.detokenizer: new_char_count = self.detokenizer.decode_sequence_inplace( seq, sampling_params) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 5632e8ad446df..e9350612ee57f 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -83,6 +83,9 @@ class EngineClient(ABC): else: processed_inputs = preprocessor._prompt_to_llm_inputs(prompt) + if processed_inputs["type"] == "embeds": + raise NotImplementedError + prompt_token_ids = processed_inputs["prompt_token_ids"] prompt_text = processed_inputs.get("prompt") multi_modal_data = processed_inputs.get("multi_modal_data") diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 0a302872d2633..69523f36ffc41 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -27,7 +27,7 @@ from vllm.entrypoints.score_utils import (_cosine_similarity, _validate_score_input_lens) from vllm.entrypoints.utils import _validate_truncation_size from vllm.inputs import PromptType, SingletonPrompt, TextPrompt, TokensPrompt -from vllm.inputs.parse import is_token_prompt, parse_and_batch_prompt +from vllm.inputs.parse import parse_and_batch_prompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.guided_decoding.guided_fields import ( @@ -567,10 +567,12 @@ class LLM: mm_kwargs["mm_processor_kwargs"] = prompt[ "mm_processor_kwargs"] - if is_token_prompt(prompt): + if "prompt_token_ids" in prompt: + prompt = cast(TokensPrompt, prompt) # Needed for mypy prompt_tokens = prompt["prompt_token_ids"] else: prompt_tokens = tokenizer.encode(prompt["prompt"]) + instances.append( BeamSearchInstance(prompt_tokens, logprobs=None, **mm_kwargs)) diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index ca706e202836d..9914a9dcffcc0 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 -from .data import (DecoderOnlyInputs, EncoderDecoderInputs, +from .data import (DecoderOnlyInputs, EmbedsInputs, EncoderDecoderInputs, ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType, SingletonInputs, SingletonPrompt, TextPrompt, TokenInputs, - TokensPrompt, build_explicit_enc_dec_prompt, + TokensPrompt, build_explicit_enc_dec_prompt, embeds_inputs, to_enc_dec_tuple_list, token_inputs, zip_enc_dec_prompts) from .registry import (DummyData, InputContext, InputProcessingContext, InputRegistry) @@ -21,7 +21,9 @@ __all__ = [ "SingletonPrompt", "ExplicitEncoderDecoderPrompt", "TokenInputs", + "EmbedsInputs", "token_inputs", + "embeds_inputs", "DecoderOnlyInputs", "EncoderDecoderInputs", "ProcessorInputs", diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 167189ed108eb..86dbca1804126 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -2,6 +2,7 @@ from collections.abc import Iterable from typing import TYPE_CHECKING, Any, Generic, Literal, Optional, Union, cast +import torch from typing_extensions import NotRequired, TypedDict, TypeVar if TYPE_CHECKING: @@ -63,12 +64,25 @@ class TokensPrompt(TypedDict): """ -SingletonPrompt = Union[str, TextPrompt, TokensPrompt] +class EmbedsPrompt(TypedDict): + """Schema for a prompt provided via token embeddings.""" + + prompt_embeds: torch.Tensor + """The embeddings of the prompt.""" + + cache_salt: NotRequired[str] + """ + Optional cache salt to be used for prefix caching. + """ + + +SingletonPrompt = Union[str, TextPrompt, TokensPrompt, EmbedsPrompt] """ Set of possible schemas for a single prompt: - A text prompt (:class:`str` or :class:`TextPrompt`) - A tokenized prompt (:class:`TokensPrompt`) +- An embeddings prompt (:class:`EmbedsPrompt`) Note that "singleton" is as opposed to a data structure which encapsulates multiple prompts, i.e. of the sort @@ -129,6 +143,7 @@ both decoder-only and encoder/decoder input types: - A text prompt (:class:`str` or :class:`TextPrompt`) - A tokenized prompt (:class:`TokensPrompt`) +- An embeddings prompt (:class:`EmbedsPrompt`) - A single data structure containing both an encoder and a decoder prompt (:class:`ExplicitEncoderDecoderPrompt`) """ @@ -176,7 +191,35 @@ def token_inputs( return inputs -DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputs"] +class EmbedsInputs(TypedDict): + """Represents embeddings-based inputs.""" + + type: Literal["embeds"] + """The type of inputs.""" + + prompt_embeds: torch.Tensor + """The embeddings of the prompt.""" + + cache_salt: NotRequired[str] + """ + Optional cache salt to be used for prefix caching. + """ + + +def embeds_inputs( + prompt_embeds: torch.Tensor, + cache_salt: Optional[str] = None, +) -> EmbedsInputs: + """Construct :class:`EmbedsInputs` from optional values.""" + inputs = EmbedsInputs(type="embeds", prompt_embeds=prompt_embeds) + + if cache_salt is not None: + inputs["cache_salt"] = cache_salt + + return inputs + + +DecoderOnlyInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"] """ The inputs in :class:`~vllm.LLMEngine` before they are passed to the model executor. @@ -198,7 +241,7 @@ class EncoderDecoderInputs(TypedDict): """The inputs for the decoder portion.""" -SingletonInputs = Union[TokenInputs, "MultiModalInputs"] +SingletonInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"] """ A processed :class:`SingletonPrompt` which can be passed to :class:`vllm.sequence.Sequence`. diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py index 28e207de1fd39..d17122b483446 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -6,8 +6,9 @@ from typing_extensions import TypeIs from vllm.utils import is_list_of -from .data import (ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType, - SingletonInputs, SingletonPrompt, TextPrompt, TokensPrompt) +from .data import (EmbedsPrompt, ExplicitEncoderDecoderPrompt, ProcessorInputs, + PromptType, SingletonInputs, SingletonPrompt, TextPrompt, + TokensPrompt) class ParsedText(TypedDict): @@ -84,23 +85,51 @@ class ParsedTokensPrompt(TypedDict): content: TokensPrompt -def parse_singleton_prompt( - prompt: SingletonPrompt, -) -> Union[ParsedStrPrompt, ParsedTextPrompt, ParsedTokensPrompt]: +class ParsedEmbedsPrompt(TypedDict): + type: Literal['embeds'] + content: EmbedsPrompt + + +ParsedSingletonPrompt = Union[ParsedStrPrompt, ParsedTextPrompt, + ParsedTokensPrompt, ParsedEmbedsPrompt] + + +@overload +def parse_singleton_prompt(prompt: str) -> ParsedStrPrompt: + ... + + +@overload +def parse_singleton_prompt(prompt: TextPrompt) -> ParsedTextPrompt: + ... + + +@overload +def parse_singleton_prompt(prompt: TokensPrompt) -> ParsedTokensPrompt: + ... + + +@overload +def parse_singleton_prompt(prompt: EmbedsPrompt) -> ParsedEmbedsPrompt: + ... + + +def parse_singleton_prompt(prompt: SingletonPrompt) -> ParsedSingletonPrompt: if isinstance(prompt, str): return ParsedStrPrompt(type="str", content=prompt) elif isinstance(prompt, dict): - if "prompt_token_ids" in prompt: - return ParsedTokensPrompt(type="tokens", - content=prompt) # type: ignore + # Type ignores are because mypy does not correctly infer the TypedDicts + # Pyright does succeed. + if "prompt_embeds" in prompt: + return ParsedEmbedsPrompt( + type="embeds", content=prompt) # type: ignore[typeddict-item] + elif "prompt_token_ids" in prompt: + return ParsedTokensPrompt( + type="tokens", content=prompt) # type: ignore[typeddict-item] elif "prompt" in prompt: return ParsedTextPrompt(type="text", content=prompt) - - raise TypeError("inputs must be a string, TextPrompt, or TokensPrompt") - - -def is_token_prompt(prompt: PromptType) -> TypeIs[TokensPrompt]: - return isinstance(prompt, dict) and "prompt_token_ids" in prompt + raise TypeError( + "inputs must be a string, TextPrompt, TokensPrompt, or EmbedsPrompt") def is_explicit_encoder_decoder_prompt( diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 83e6907f8c496..53e0a477a12df 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -6,6 +6,7 @@ from typing import Any, Optional, Union, cast from typing_extensions import assert_never +from vllm import envs from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -13,12 +14,14 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs, MultiModalInputs) from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import TokenizerGroup -from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs, - PromptType, SingletonInputs, SingletonPrompt, token_inputs) -from .parse import (ParsedStrPrompt, ParsedTextPrompt, ParsedTokensPrompt, - is_explicit_encoder_decoder_prompt, parse_singleton_prompt) +from .data import (DecoderOnlyInputs, EmbedsInputs, EmbedsPrompt, + EncoderDecoderInputs, ProcessorInputs, PromptType, + SingletonInputs, SingletonPrompt, TextPrompt, TokenInputs, + TokensPrompt, embeds_inputs, token_inputs) +from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt logger = init_logger(__name__) @@ -137,13 +140,10 @@ class InputPreprocessor: """ Prepares `decoder_input_ids` for generation with encoder-decoder models. - Based on - - https://github.com/huggingface/transformers/blob/ - 4037a2b5b1278736e566aec12e169100275545ea/ - src/transformers/generation/utils.py - - specifically GenerationMixin._prepare_decoder_input_ids_for_generation() + Based on: + https://github.com/huggingface/transformers/blob/4037a2b5b1278736e566aec12e169100275545ea/src/transformers/generation/utils.py + specifically, + `GenerationMixin._prepare_decoder_input_ids_for_generation()`. Arguments: @@ -180,6 +180,23 @@ class InputPreprocessor: return prompt_token_ids + def _get_tokenization_kw( + self, + overrides: Optional[dict[str, Any]] = None, + ) -> dict[str, Any]: + kwargs = dict[str, Any]() + + if self.model_config.hf_config.model_type == "whisper": + # For Whisper, special tokens should be provided by the user based + # on the task and language of their request. Also needed to avoid + # appending an EOS token to the prompt which disrupts generation. + kwargs["add_special_tokens"] = False + + if overrides: + kwargs.update(overrides) + + return kwargs + def _tokenize_prompt( self, prompt: str, @@ -191,18 +208,11 @@ class InputPreprocessor: corresponding token IDs. """ tokenizer = self.get_tokenizer_group() - if tokenization_kwargs is None: - tokenization_kwargs = {} + tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs) - if self.model_config.hf_config.model_type == "whisper": - # For Whisper, special tokens should be provided by the user based - # on the task and language of their request. Also needed to avoid - # appending an EOS token to the prompt which disrupts generation. - tokenization_kwargs["add_special_tokens"] = False + encoder_config = self.model_config.encoder_config - if (self.model_config.encoder_config is not None - and self.model_config.encoder_config.get( - "do_lower_case", False)): + if encoder_config and encoder_config.get("do_lower_case", False): prompt = prompt.lower() return tokenizer.encode(prompt=prompt, @@ -217,18 +227,36 @@ class InputPreprocessor: ) -> list[int]: """Async version of :meth:`_tokenize_prompt`.""" tokenizer = self.get_tokenizer_group() - if tokenization_kwargs is None: - tokenization_kwargs = {} + tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs) - if self.model_config.hf_config.model_type == "whisper": - # For Whisper, special tokens should be provided by the user based - # on the task and language of their request. Also needed to avoid - # appending an EOS token to the prompt which disrupts generation. - tokenization_kwargs["add_special_tokens"] = False return await tokenizer.encode_async(prompt=prompt, lora_request=lora_request, **tokenization_kwargs) + def _get_mm_tokenizer( + self, + lora_request: Optional[LoRARequest], + ) -> AnyTokenizer: + # PrithviGeoSpatialMAE needs to be initialized without a tokenizer + # while using also multi-modal input + if not self.tokenizer: + return cast(AnyTokenizer, object()) # Dummy + + tokenizer_group = self.get_tokenizer_group() + return tokenizer_group.get_lora_tokenizer(lora_request) + + async def _get_mm_tokenizer_async( + self, + lora_request: Optional[LoRARequest], + ) -> AnyTokenizer: + # PrithviGeoSpatialMAE needs to be initialized without a tokenizer + # while using also multi-modal input + if not self.tokenizer: + return cast(AnyTokenizer, object()) # Dummy + + tokenizer_group = self.get_tokenizer_group() + return await tokenizer_group.get_lora_tokenizer_async(lora_request) + def _process_multimodal( self, prompt: Union[str, list[int]], @@ -241,13 +269,7 @@ class InputPreprocessor: Apply the model's multi-modal processor to a multi-modal prompt, returning the corresponding token IDs and metadata. """ - # At the moment on model (PrithviGeoSpatialMAE) requires to be - # initialized without a tokenizer while using also multi-modal input - if not self.tokenizer: - tokenizer = object() # Dummy - else: - tokenizer_group = self.get_tokenizer_group() - tokenizer = tokenizer_group.get_lora_tokenizer(lora_request) + tokenizer = self._get_mm_tokenizer(lora_request) mm_processor = self.mm_registry.create_processor(self.model_config, tokenizer=tokenizer) @@ -267,14 +289,7 @@ class InputPreprocessor: return_mm_hashes: bool = False, ) -> MultiModalInputs: """Async version of :meth:`_process_multimodal`.""" - # At the moment on model (PrithviGeoSpatialMAE) requires to be - # initialized without a tokenizer while using also multi-modal input - if not self.tokenizer: - tokenizer = object() # Dummy - else: - tokenizer_group = self.get_tokenizer_group() - tokenizer = await tokenizer_group.get_lora_tokenizer_async( - lora_request) + tokenizer = await self._get_mm_tokenizer_async(lora_request) mm_processor = self.mm_registry.create_processor(self.model_config, tokenizer=tokenizer) @@ -284,28 +299,163 @@ class InputPreprocessor: return mm_processor.apply(prompt, mm_data, mm_processor_kwargs, return_mm_hashes) - def _get_prompt_data(self, parsed_prompt: Union[ParsedStrPrompt, - ParsedTextPrompt, - ParsedTokensPrompt]): - prompt_text = None - prompt_token_ids = None - token_type_ids = None - cache_salt = None + def _process_embeds( + self, + parsed_content: EmbedsPrompt, + ) -> EmbedsInputs: + if not self.model_config.enable_prompt_embeds: + raise ValueError("You must set `--enable-prompt-embeds` to input " + "`prompt_embeds`.") + if envs.VLLM_USE_V1: + raise ValueError("`prompt_embeds` is only available in V0.") - if parsed_prompt["type"] == "str": - prompt_text = parsed_prompt["content"] + prompt_embeds = parsed_content["prompt_embeds"] + + # prompt_embeds must be (seq_len, hidden_size), but if the user + # passes in a batch of size 1, i.e. (1, seq_len, hidden_size), + # we can unambiguously process the intent by squeezing the batch + # dimension. + if prompt_embeds.ndim == 3: + prompt_embeds = prompt_embeds.squeeze(dim=0) + + if prompt_embeds.ndim != 2: + raise ValueError( + "prompt_embeds must be of shape (seq_len, hidden_size).") + + return embeds_inputs(prompt_embeds=prompt_embeds, + cache_salt=parsed_content.get("cache_salt")) + + async def _process_embeds_async( + self, + parsed_content: EmbedsPrompt, + ) -> EmbedsInputs: + return self._process_embeds(parsed_content) + + def _process_tokens( + self, + parsed_content: TokensPrompt, + lora_request: Optional[LoRARequest] = None, + return_mm_hashes: bool = False, + ) -> Union[TokenInputs, MultiModalInputs]: + prompt_token_ids = parsed_content["prompt_token_ids"] + token_type_ids = parsed_content.get("token_type_ids") + + inputs: Union[TokenInputs, MultiModalInputs] + if multi_modal_data := parsed_content.get("multi_modal_data"): + inputs = self._process_multimodal( + prompt_token_ids, + multi_modal_data, + parsed_content.get("mm_processor_kwargs"), + lora_request=lora_request, + return_mm_hashes=return_mm_hashes, + ) else: - cache_salt = parsed_prompt["content"].get("cache_salt") - if parsed_prompt["type"] == "text": - prompt_text = parsed_prompt["content"]["prompt"] - elif parsed_prompt["type"] == "tokens": - prompt_token_ids = parsed_prompt["content"].get( - "prompt_token_ids") - token_type_ids = parsed_prompt["content"].get("token_type_ids") - else: - assert_never(parsed_prompt) + inputs = token_inputs( + prompt_token_ids=prompt_token_ids, + token_type_ids=token_type_ids, + ) - return prompt_text, prompt_token_ids, token_type_ids, cache_salt + if cache_salt := parsed_content.get("cache_salt"): + inputs["cache_salt"] = cache_salt + + return inputs + + async def _process_tokens_async( + self, + parsed_content: TokensPrompt, + lora_request: Optional[LoRARequest] = None, + return_mm_hashes: bool = False, + ) -> Union[TokenInputs, MultiModalInputs]: + prompt_token_ids = parsed_content["prompt_token_ids"] + token_type_ids = parsed_content.get("token_type_ids") + + inputs: Union[TokenInputs, MultiModalInputs] + if multi_modal_data := parsed_content.get("multi_modal_data"): + inputs = await self._process_multimodal_async( + prompt_token_ids, + multi_modal_data, + parsed_content.get("mm_processor_kwargs"), + lora_request=lora_request, + return_mm_hashes=return_mm_hashes, + ) + else: + inputs = token_inputs( + prompt_token_ids=prompt_token_ids, + token_type_ids=token_type_ids, + ) + + if cache_salt := parsed_content.get("cache_salt"): + inputs["cache_salt"] = cache_salt + + return inputs + + def _process_text( + self, + parsed_content: TextPrompt, + tokenization_kwargs: Optional[dict[str, Any]] = None, + lora_request: Optional[LoRARequest] = None, + return_mm_hashes: bool = False, + ) -> Union[TokenInputs, MultiModalInputs]: + prompt_text = parsed_content["prompt"] + + inputs: Union[TokenInputs, MultiModalInputs] + if multi_modal_data := parsed_content.get("multi_modal_data"): + inputs = self._process_multimodal( + prompt_text, + multi_modal_data, + parsed_content.get("mm_processor_kwargs"), + lora_request=lora_request, + return_mm_hashes=return_mm_hashes, + ) + else: + prompt_token_ids = self._tokenize_prompt( + prompt_text, + lora_request=lora_request, + tokenization_kwargs=tokenization_kwargs, + ) + inputs = token_inputs( + prompt=prompt_text, + prompt_token_ids=prompt_token_ids, + ) + + if cache_salt := parsed_content.get("cache_salt"): + inputs["cache_salt"] = cache_salt + + return inputs + + async def _process_text_async( + self, + parsed_content: TextPrompt, + tokenization_kwargs: Optional[dict[str, Any]] = None, + lora_request: Optional[LoRARequest] = None, + return_mm_hashes: bool = False, + ) -> Union[TokenInputs, MultiModalInputs]: + prompt_text = parsed_content["prompt"] + + inputs: Union[TokenInputs, MultiModalInputs] + if multi_modal_data := parsed_content.get("multi_modal_data"): + inputs = await self._process_multimodal_async( + prompt_text, + multi_modal_data, + parsed_content.get("mm_processor_kwargs"), + lora_request=lora_request, + return_mm_hashes=return_mm_hashes, + ) + else: + prompt_token_ids = await self._tokenize_prompt_async( + prompt_text, + lora_request=lora_request, + tokenization_kwargs=tokenization_kwargs, + ) + inputs = token_inputs( + prompt=prompt_text, + prompt_token_ids=prompt_token_ids, + ) + + if cache_salt := parsed_content.get("cache_salt"): + inputs["cache_salt"] = cache_salt + + return inputs def _prompt_to_llm_inputs( self, @@ -328,36 +478,31 @@ class InputPreprocessor: * :class:`SingletonInputs` instance """ parsed = parse_singleton_prompt(prompt) - prompt_text, prompt_token_ids, token_type_ids, cache_salt = \ - self._get_prompt_data(parsed) - # If multimodal data is present, process and return immediately - if parsed["type"] != "str" and parsed["content"].get( - "multi_modal_data") is not None: - inputs = self._process_multimodal( - prompt_text if prompt_text is not None else prompt_token_ids, - parsed["content"]["multi_modal_data"], - parsed["content"].get("mm_processor_kwargs"), + if parsed["type"] == "embeds": + return self._process_embeds(parsed["content"]) + if parsed["type"] == "tokens": + return self._process_tokens( + parsed["content"], lora_request=lora_request, return_mm_hashes=return_mm_hashes, ) - if cache_salt is not None: - inputs["cache_salt"] = cache_salt - return inputs - - if prompt_token_ids is None: - prompt_token_ids = self._tokenize_prompt( - prompt_text, - lora_request=lora_request, + if parsed["type"] == "text": + return self._process_text( + parsed["content"], tokenization_kwargs=tokenization_kwargs, + lora_request=lora_request, + return_mm_hashes=return_mm_hashes, + ) + if parsed["type"] == "str": + return self._process_text( + TextPrompt(prompt=parsed["content"]), + tokenization_kwargs=tokenization_kwargs, + lora_request=lora_request, + return_mm_hashes=return_mm_hashes, ) - return token_inputs( - prompt=prompt_text, - prompt_token_ids=prompt_token_ids, - token_type_ids=token_type_ids, - cache_salt=cache_salt, - ) + assert_never(parsed) async def _prompt_to_llm_inputs_async( self, @@ -366,49 +511,49 @@ class InputPreprocessor: lora_request: Optional[LoRARequest] = None, return_mm_hashes: bool = False, ) -> SingletonInputs: - """Async version of :meth:`_extract_prompt_components`.""" + """Async version of :meth:`_prompt_to_llm_inputs`.""" parsed = parse_singleton_prompt(prompt) - prompt_text, prompt_token_ids, token_type_ids, cache_salt = \ - self._get_prompt_data(parsed) - - if parsed["type"] != "str" and parsed["content"].get( - "multi_modal_data") is not None: - inputs = await self._process_multimodal_async( - prompt_token_ids if prompt_text is None else prompt_text, - parsed["content"]["multi_modal_data"], - parsed["content"].get("mm_processor_kwargs"), + if parsed["type"] == "embeds": + return await self._process_embeds_async(parsed["content"]) + if parsed["type"] == "tokens": + return await self._process_tokens_async( + parsed["content"], lora_request=lora_request, return_mm_hashes=return_mm_hashes, ) - if cache_salt is not None: - inputs["cache_salt"] = cache_salt - return inputs - - if prompt_token_ids is None: - prompt_token_ids = await self._tokenize_prompt_async( - prompt_text, - lora_request=lora_request, + if parsed["type"] == "text": + return await self._process_text_async( + parsed["content"], tokenization_kwargs=tokenization_kwargs, + lora_request=lora_request, + return_mm_hashes=return_mm_hashes, + ) + if parsed["type"] == "str": + return await self._process_text_async( + TextPrompt(prompt=parsed["content"]), + tokenization_kwargs=tokenization_kwargs, + lora_request=lora_request, + return_mm_hashes=return_mm_hashes, ) - return token_inputs( - prompt=prompt_text, - prompt_token_ids=prompt_token_ids, - token_type_ids=token_type_ids, - cache_salt=cache_salt, - ) + assert_never(parsed) def _build_enc_dec_llm_inputs( self, encoder_inputs: SingletonInputs, decoder_inputs: Optional[SingletonInputs], ) -> EncoderDecoderInputs: - if (encoder_inputs["type"] == "token" - or encoder_inputs["type"] == "multimodal"): - pass - else: - assert_never(encoder_inputs) # type: ignore[arg-type] + if (encoder_inputs["type"] == "embeds" + or decoder_inputs and decoder_inputs["type"] == "embeds"): + raise ValueError("Embedding inputs are not supported for encoder-" + "decoder models") + + # Needed for mypy + encoder_inputs = cast(Union[TokenInputs, MultiModalInputs], + encoder_inputs) + decoder_inputs = cast(Optional[Union[TokenInputs, MultiModalInputs]], + decoder_inputs) if decoder_inputs is None: if self.model_config.hf_config.model_type == "whisper": @@ -421,73 +566,78 @@ class InputPreprocessor: dec_token_ids = self._prepare_decoder_input_ids_for_generation( None) decoder_inputs = token_inputs(dec_token_ids) - elif (decoder_inputs["type"] == "token" - or decoder_inputs["type"] == "multimodal"): - dec_token_ids = self._prepare_decoder_input_ids_for_generation( - decoder_inputs["prompt_token_ids"]) - decoder_inputs["prompt_token_ids"] = dec_token_ids - + else: if "multi_modal_data" in decoder_inputs: raise ValueError("Multi-modal decoder inputs of encoder-" "decoder models are not supported yet") - else: - assert_never(encoder_inputs) # type: ignore[arg-type] + + dec_token_ids = self._prepare_decoder_input_ids_for_generation( + decoder_inputs["prompt_token_ids"]) + decoder_inputs["prompt_token_ids"] = dec_token_ids return EncoderDecoderInputs( encoder=encoder_inputs, decoder=decoder_inputs, ) - def _separate_enc_dec_inputs_from_mm_processor_outputs( + def _split_enc_dec_mm_inputs( self, - inputs: SingletonInputs, + inputs: Union[SingletonInputs, MultiModalEncDecInputs], decoder_inputs_to_override: Optional[SingletonInputs] = None, ) -> tuple[SingletonInputs, SingletonInputs]: """ For encoder/decoder models only: Separate Encoder/Decoder inputs from a MultiModalEncDecInputs """ + if (inputs["type"] == "embeds" or decoder_inputs_to_override + and decoder_inputs_to_override["type"] == "embeds"): + raise ValueError("Embedding inputs are not supported for encoder-" + "decoder models") + + # Needed for mypy + inputs = cast( + Union[TokenInputs, MultiModalInputs, MultiModalEncDecInputs], + inputs, + ) + decoder_inputs_to_override = cast( + Optional[Union[TokenInputs, MultiModalInputs]], + decoder_inputs_to_override, + ) + encoder_inputs: SingletonInputs decoder_inputs: SingletonInputs - if inputs["type"] == "multimodal": - # Multimodal data inputs - assert ("encoder_prompt" in inputs - and "encoder_prompt_token_ids" in inputs) + + if inputs["type"] == "multimodal": # Multimodal data inputs + if not ("encoder_prompt" in inputs + and "encoder_prompt_token_ids" in inputs): + raise RuntimeError("You should register an encoder-decoder " + "multi-modal processor for encoder-decoder " + "models.") inputs = cast(MultiModalEncDecInputs, inputs) + encoder_inputs = token_inputs( prompt=inputs["encoder_prompt"], prompt_token_ids=inputs["encoder_prompt_token_ids"], ) - if decoder_inputs_to_override is not None: - decoder_inputs = MultiModalInputs( - type="multimodal", - prompt=decoder_inputs_to_override.get("prompt", ""), - prompt_token_ids=decoder_inputs_to_override[ - "prompt_token_ids"], - mm_kwargs=inputs["mm_kwargs"], - mm_hashes=inputs["mm_hashes"], - mm_placeholders=inputs["mm_placeholders"], - ) - else: - decoder_inputs = MultiModalInputs( - type="multimodal", - prompt=inputs["prompt"], - prompt_token_ids=inputs["prompt_token_ids"], - mm_kwargs=inputs["mm_kwargs"], - mm_hashes=inputs["mm_hashes"], - mm_placeholders=inputs["mm_placeholders"], - ) - cache_salt = inputs.get("cache_salt") - if cache_salt is not None: + decoder_prompt_inputs = decoder_inputs_to_override or inputs + decoder_inputs = MultiModalInputs( + type="multimodal", + prompt=decoder_prompt_inputs.get("prompt", ""), + prompt_token_ids=decoder_prompt_inputs["prompt_token_ids"], + mm_kwargs=inputs["mm_kwargs"], + mm_hashes=inputs["mm_hashes"], + mm_placeholders=inputs["mm_placeholders"], + ) + if cache_salt := inputs.get("cache_salt"): decoder_inputs["cache_salt"] = cache_salt - elif inputs["type"] == "token": - # Text-only inputs + elif inputs["type"] == "token": # Text-only inputs encoder_inputs = token_inputs(prompt="", prompt_token_ids=[]) decoder_inputs = decoder_inputs_to_override or inputs else: assert_never(inputs) # type: ignore[arg-type] + return encoder_inputs, decoder_inputs def _process_encoder_decoder_prompt( @@ -541,8 +691,8 @@ class InputPreprocessor: # with explicit decoder prompt. if self.model_config.is_multimodal_model: encoder_inputs, decoder_inputs = ( - self._separate_enc_dec_inputs_from_mm_processor_outputs( - encoder_inputs, decoder_inputs)) + self._split_enc_dec_mm_inputs(encoder_inputs, + decoder_inputs)) else: inputs = self._prompt_to_llm_inputs( prompt, @@ -551,11 +701,9 @@ class InputPreprocessor: if self.model_config.is_multimodal_model: # Encoder-Decoder Multimodal model encoder_inputs, decoder_inputs = ( - self._separate_enc_dec_inputs_from_mm_processor_outputs( - inputs)) + self._split_enc_dec_mm_inputs(inputs)) else: encoder_inputs = inputs - decoder_inputs = None return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs) @@ -591,8 +739,8 @@ class InputPreprocessor: # with explicit decoder prompt. if self.model_config.is_multimodal_model: encoder_inputs, decoder_inputs = ( - self._separate_enc_dec_inputs_from_mm_processor_outputs( - encoder_inputs, decoder_inputs)) + self._split_enc_dec_mm_inputs(encoder_inputs, + decoder_inputs)) else: inputs = await self._prompt_to_llm_inputs_async( prompt, @@ -601,11 +749,9 @@ class InputPreprocessor: if self.model_config.is_multimodal_model: # Encoder-Decoder Multimodal model encoder_inputs, decoder_inputs = ( - self._separate_enc_dec_inputs_from_mm_processor_outputs( - inputs)) + self._split_enc_dec_mm_inputs(inputs)) else: encoder_inputs = inputs - decoder_inputs = None return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs) @@ -615,14 +761,13 @@ class InputPreprocessor: prompt_inputs: DecoderOnlyInputs, prompt_adapter_request: Optional[PromptAdapterRequest], ) -> DecoderOnlyInputs: - if (prompt_inputs["type"] == "token" - or prompt_inputs["type"] == "multimodal"): + if "prompt_token_ids" in prompt_inputs: + prompt_inputs = cast(Union[TokenInputs, MultiModalInputs], + prompt_inputs) # Needed for mypy prompt_inputs["prompt_token_ids"] = self._apply_prompt_adapter( prompt_inputs["prompt_token_ids"], prompt_adapter_request=prompt_adapter_request, ) - else: - assert_never(prompt_inputs) # type: ignore[arg-type] return prompt_inputs diff --git a/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..e539335d4dc73 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=1024,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 62614a59cbe9a..238808b226f43 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -71,8 +71,8 @@ def single_marlin_moe( E = w.shape[0] N = w.shape[2] // (num_bits // 2) - topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, - renormalize) + topk_weights, topk_ids, token_expert_indices = fused_topk( + hidden_states, gating_output, topk, renormalize) # This might not be an optimal config for a single MMM get_config_func = functools.partial(try_get_optimal_moe_config, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index a209715ede77c..c1edbda0dd224 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -854,7 +854,7 @@ def fused_topk( gating_output: torch.Tensor, topk: int, renormalize: bool, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: assert hidden_states.shape[0] == gating_output.shape[0], ( "Number of tokens mismatch") @@ -868,20 +868,19 @@ def fused_topk( topk, dtype=torch.int32, device=hidden_states.device) - token_expert_indicies = torch.empty(M, - topk, - dtype=torch.int32, - device=hidden_states.device) + token_expert_indices = torch.empty(M, + topk, + dtype=torch.int32, + device=hidden_states.device) gating_output_float = gating_output.float() # TODO(woosuk): Optimize this. topk_func = dispatch_topk_func() topk_weights, topk_ids = topk_func(topk_weights, topk_ids, - token_expert_indicies, + token_expert_indices, gating_output_float, renormalize) - del token_expert_indicies # Not used. Will be used in the future. - return topk_weights, topk_ids + return topk_weights, topk_ids, token_expert_indices # This is used by the Deepseek-V2 and Deepseek-V3 model @@ -1510,8 +1509,8 @@ def fused_moe( topk, renormalize, num_expert_group, topk_group) elif custom_routing_function is None: - topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk, - renormalize) + topk_weights, topk_ids, token_expert_indices = fused_topk( + hidden_states, gating_output, topk, renormalize) else: topk_weights, topk_ids = custom_routing_function( hidden_states, gating_output, topk, renormalize) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 3cdf3c97a7d3e..35994c8ac6af0 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -801,10 +801,11 @@ class FusedMoE(torch.nn.Module): scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias) elif custom_routing_function is None: - topk_weights, topk_ids = fused_topk(hidden_states=hidden_states, - gating_output=router_logits, - topk=top_k, - renormalize=renormalize) + topk_weights, topk_ids, token_expert_indices = fused_topk( + hidden_states=hidden_states, + gating_output=router_logits, + topk=top_k, + renormalize=renormalize) else: topk_weights, topk_ids = custom_routing_function( hidden_states=hidden_states, diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py new file mode 100644 index 0000000000000..cdf7e31c1436e --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py @@ -0,0 +1,116 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional, Tuple + +import torch + + +def moe_permute( + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + token_expert_indices: torch.Tensor, + topk: int, + n_expert: int, + n_local_expert: int, + expert_map: Optional[torch.Tensor] = None, + align_block_size: Optional[int] = None, + fill_invalid_expert: int = -1 +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + This function expands and permutes activation to gather uncontinuous tokens + for each expert. + Parameters: + - hidden_states (torch.Tensor): The input tensor to the MoE layer. + - topk_weights (torch.Tensor): topk expert route weight for each token. + - topk_ids (torch.Tensor): topk expert route id for each token. + - token_expert_indices (torch.Tensor): indice for expanded hidden. + - topk (int): The number of top-k experts to select. + - n_expert (int): The number of expert. + - n_local_expert (int): The number of expert in current EP rank. + - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices + from the global expert space to the local expert space of the expert + parallel shard. + - align_block_size (Optional[int]): align group gemm block size for deepgemm + - fill_invalid_expert(int): fill expert id in m_indices for invalid expert + to workaround DeepGemm unsupported -1 in m_indices + Returns: + - permuted_hidden_states (torch.Tensor): permuted activation. + - expert_first_token_offset (torch.Tensor): offset of the first token + of each expert for standard grouped gemm. if enable 'align_block_size' + expert_first_token_offset will align up to 'align_block_size'. + - src_row_id2dst_row_id_map (torch.Tensor): idx map for moe_unpermute. + - m_indices: m_indices for grouped gemm in deepgemm,`m_indices[i]` records + the group which the j-th row of the LHS belong to.` + """ + n_token, n_hidden = hidden_states.shape + assert (n_hidden * hidden_states.element_size() + ) % 16 == 0, "permue kernel need hidden dim align to 16B" + permuted_row_size = n_token * topk + if align_block_size is not None: + permuted_row_size = (permuted_row_size + n_expert * + (align_block_size - 1) + align_block_size - + 1) // align_block_size * align_block_size + + permuted_hidden_states = torch.empty( + (permuted_row_size, n_hidden), + dtype=hidden_states.dtype, + device=hidden_states.device, + ) + m_indices = torch.full((permuted_row_size, ), + fill_invalid_expert, + dtype=torch.int32, + device=hidden_states.device) + expert_first_token_offset = torch.empty(n_local_expert + 1, + dtype=torch.int64, + device=hidden_states.device) + src_row_id2dst_row_id_map = torch.empty((n_token, topk), + dtype=torch.int32, + device=hidden_states.device) + torch.ops._moe_C.moe_permute(hidden_states, topk_weights, topk_ids, + token_expert_indices, expert_map, n_expert, + n_local_expert, topk, align_block_size, + permuted_hidden_states, + expert_first_token_offset, + src_row_id2dst_row_id_map, m_indices) + return (permuted_hidden_states, expert_first_token_offset, + src_row_id2dst_row_id_map, m_indices) + + +def moe_unpermute( + permuted_hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + src_row_id2dst_row_id_map: torch.Tensor, + expert_first_token_offset: torch.Tensor, + topk: int, + n_expert: int, + n_local_expert: int, +) -> torch.Tensor: + """ + This function expands and permutes activation to gathering uncontinuous + tokens for each expert. + Parameters: + - permuted_hidden_states (torch.Tensor): permuted activation. + - topk_weights (torch.Tensor): topk expert route weight for each token. + - topk_ids (torch.Tensor): topk expert route id for each token. + - expert_first_token_offset (torch.Tensor): offset of the first token + of each expert for grouped gemm. + - topk (int): The number of top-k experts to select. + - n_expert (int): The number of expert. + - n_local_expert (int): The number of expert in current EP rank. + Returns: + - hidden_states (torch.Tensor): The reduced and unpermuted activation + tensor. + """ + n_token, n_hidden = topk_weights.shape[0], permuted_hidden_states.shape[-1] + assert (n_hidden * permuted_hidden_states.element_size() + ) % 16 == 0, "unpermue kernel need hidden dim align to 16B" + hidden_states = torch.empty((n_token, n_hidden), + dtype=permuted_hidden_states.dtype, + device=permuted_hidden_states.device) + + torch.ops._moe_C.moe_unpermute(permuted_hidden_states, topk_weights, + topk_ids, src_row_id2dst_row_id_map, + expert_first_token_offset, n_expert, + n_local_expert, topk, hidden_states) + return hidden_states diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 07d928b597bad..f7c885c2baa31 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -140,7 +140,7 @@ class AWQMarlinConfig(QuantizationConfig): from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) if not check_moe_marlin_supports_layer(layer, self.group_size): - logger.warning_one( + logger.warning_once( f"Layer '{prefix}' is not supported by AWQMoeMarlin. " "Falling back to Moe WNA16 kernels.") return MoeWNA16Config.from_config( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 721e36af2b28f..ae16a20cfaab9 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -34,6 +34,7 @@ __all__ = [ "CompressedTensorsMoEMethod", "CompressedTensorsW8A8Fp8MoEMethod", "CompressedTensorsW8A8Fp8MoECutlassMethod", + "CompressedTensorsW8A8Int8MoEMethod", "CompressedTensorsWNA16MarlinMoEMethod", "CompressedTensorsWNA16MoEMethod", ] @@ -71,6 +72,8 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): return CompressedTensorsW8A8Fp8MoECutlassMethod(quant_config) elif quant_config._is_fp8_w8a8(weight_quant, input_quant): return CompressedTensorsW8A8Fp8MoEMethod(quant_config) + elif quant_config._is_dynamic_token_w8a8(weight_quant, input_quant): + return CompressedTensorsW8A8Int8MoEMethod(quant_config) else: raise RuntimeError( f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}") @@ -545,6 +548,138 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod): ) +class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): + + def __init__( + self, + quant_config: "CompressedTensorsConfig" # type: ignore # noqa E501 + ): + self.quant_config = quant_config + self.weight_quant = self.quant_config.target_scheme_map["Linear"].get( + "weights") + self.input_quant = self.quant_config.target_scheme_map["Linear"].get( + "input_activations") + + per_channel = ( + self.weight_quant.strategy == QuantizationStrategy.CHANNEL + and self.input_quant.strategy == QuantizationStrategy.TOKEN) + if not per_channel: + raise ValueError( + "For INT8 Fused MoE layers, we require channelwise, " + "dynamic per token quantization. Found " + f"{self.weight_quant}, {self.input_quant}") + + self.static_input_scales = not self.input_quant.dynamic + if self.static_input_scales: + raise ValueError( + "For INT8 Fused MoE layers, we require channelwise, " + "dynamic per token quantization. Found static input scales.") + + def create_weights(self, layer: torch.nn.Module, num_experts: int, + hidden_size: int, intermediate_size_per_partition: int, + params_dtype: torch.dtype, **extra_weight_attrs): + + params_dtype = torch.int8 + + # WEIGHTS + w13_weight = torch.nn.Parameter(torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w13_weight", w13_weight) + set_weight_attrs(w13_weight, extra_weight_attrs) + + w2_weight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=params_dtype), + requires_grad=False) + layer.register_parameter("w2_weight", w2_weight) + set_weight_attrs(w2_weight, extra_weight_attrs) + + # WEIGHT_SCALES + assert self.weight_quant.strategy == QuantizationStrategy.CHANNEL + w13_weight_scale = torch.nn.Parameter(torch.ones( + num_experts, + 2 * intermediate_size_per_partition, + 1, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + w2_weight_scale = torch.nn.Parameter(torch.ones(num_experts, + hidden_size, + 1, + dtype=torch.float32), + requires_grad=False) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + # Add PER-CHANNEL quantization for FusedMoE.weight_loader. + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + # INPUT_SCALES + assert not self.static_input_scales + layer.w13_input_scale = None + layer.w2_input_scale = None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + pass + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + activation: str = "silu", + ) -> torch.Tensor: + from vllm.model_executor.layers.fused_moe import fused_experts + + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias) + + return fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + use_int8_w8a8=True, + per_channel_quant=True, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale) + + class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): def __init__( diff --git a/vllm/model_executor/layers/quantization/gptq_bitblas.py b/vllm/model_executor/layers/quantization/gptq_bitblas.py index 6ee3a2f1bbbb0..b06c9579d63db 100644 --- a/vllm/model_executor/layers/quantization/gptq_bitblas.py +++ b/vllm/model_executor/layers/quantization/gptq_bitblas.py @@ -134,7 +134,7 @@ class GPTQBitBLASConfig(QuantizationConfig): @classmethod def get_min_capability(cls) -> int: - return 70 + return 80 @classmethod def get_config_filenames(cls) -> List[str]: diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index c7f9d95f4c2df..703d54b3bee6a 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -157,7 +157,7 @@ class GPTQMarlinConfig(QuantizationConfig): from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) if not check_moe_marlin_supports_layer(layer, self.group_size): - logger.warning_one( + logger.warning_once( f"Layer '{prefix}' is not supported by GPTQMoeMarlin. " "Falling back to Moe WNA16 kernels.") return MoeWNA16Config.from_config( diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py index 2bf21a05c46d9..047724129522b 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py @@ -111,7 +111,7 @@ class CutlassScaledMMLinearKernel(ScaledMMLinearKernel): # * dynamic, i_s is None and x_s computed from x. # * static, i_s is scalar and x_s is i_s. symmetric = azp_adj is None - x_q, x_s, x_zp = ops.scaled_int8_quant(x, + x_q, x_s, x_zp = ops.scaled_int8_quant(x.contiguous(), i_s, i_zp, symmetric=symmetric) diff --git a/vllm/model_executor/layers/quantization/utils/bitblas_utils.py b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py index 5d28d327e8a2f..e26ac4ea3d4c7 100644 --- a/vllm/model_executor/layers/quantization/utils/bitblas_utils.py +++ b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py @@ -71,6 +71,15 @@ def _check_bitblas_supported( f"Only group_sizes = {BITBLAS_SUPPORTED_GROUP_SIZES} " "are supported.") + # Finally, check if bitblas is installed + try: + import bitblas + if bitblas.__version__ < MINIMUM_BITBLAS_VERSION: + raise ImportError("bitblas version is wrong. Please " + f"install bitblas>={MINIMUM_BITBLAS_VERSION}") + except ImportError: + return False, "BitBLAS is not installed." + return True, None diff --git a/vllm/model_executor/layers/quantization/utils/int8_utils.py b/vllm/model_executor/layers/quantization/utils/int8_utils.py index 98b06b6c2ae96..aaaf7a9e0a4c2 100644 --- a/vllm/model_executor/layers/quantization/utils/int8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/int8_utils.py @@ -85,6 +85,32 @@ def block_dequant( return x_dq_block +if current_platform.is_rocm(): + from triton.language import core + + # NOTE: This can be removed when hip.libdevice.round() is available. + @core.extern + def round_f32(arg0, _builder=None): + return core.extern_elementwise("", + "", [arg0], { + (core.dtype("fp32"), ): + ("llvm.round", core.dtype("fp32")), + (core.dtype("fp64"), ): + ("llvm.round", core.dtype("fp64")), + }, + is_pure=True, + _builder=_builder) + + @triton.jit + def round_int8(x): + return round_f32(x).to(tl.int8) +else: + + @triton.jit + def round_int8(x): + return tl.extra.cuda.libdevice.round(x).to(tl.int8) + + @triton.jit def _per_token_quant_int8( x_ptr, @@ -106,7 +132,7 @@ def _per_token_quant_int8( absmax = tl.maximum(tl.max(tl.abs(x)), 1e-10) scale_x = absmax / 127 x_q = x * (127 / absmax) - x_q = tl.extra.cuda.libdevice.round(x_q).to(tl.int8) + x_q = round_int8(x_q) tl.store(xq_ptr + row_id * stride_xq + cols, x_q, mask=mask) tl.store(scale_ptr + row_id, scale_x) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 1ee1332ac45ec..9368992b24fe0 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -110,6 +110,11 @@ class SamplerOutput( # 'broadcasted' to all other PP ranks for next step. sampled_token_ids_cpu: Optional[torch.Tensor] = None + # On-device tensor containing the sampled token embeddings (embeddings + # corresponding to the sampled token ids). Used when prompt embeddings are + # specified in lieu of prompt token ids or text. + sampled_token_embeds: Optional[torch.Tensor] = None + # Spec decode metrics populated by workers. spec_decode_worker_metrics: Optional[SpecDecodeWorkerMetrics] = None @@ -183,7 +188,7 @@ class Sampler(nn.Module): # Whether or not the SamplerOutput should have on-device tensors # containing the sampled token ids and probabilities. This is used by - # speculative decoding. + # speculative decoding and when prompt embeddings are specified. self.include_gpu_probs_tensor = False self.should_modify_greedy_probs_inplace = False diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index cb9100e355945..01f75db9ee862 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -384,6 +384,7 @@ class DefaultModelLoader(BaseModelLoader): weights_iterator = pt_weights_iterator( hf_weights_files, self.load_config.use_tqdm_on_load, + self.load_config.pt_load_map_location, ) if current_platform.is_tpu(): @@ -890,6 +891,7 @@ class BitsAndBytesModelLoader(BaseModelLoader): iterator = pt_weights_iterator( hf_weights_files, self.load_config.use_tqdm_on_load, + self.load_config.pt_load_map_location, ) for org_name, param in iterator: # mapping weight names from transformers to vllm while preserving diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 37a8491cf63d5..10bc55ca5f7d4 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -502,6 +502,7 @@ def fastsafetensors_weights_iterator( def pt_weights_iterator( hf_weights_files: List[str], use_tqdm_on_load: bool, + pt_load_map_location: Union[str, dict[str, str]] = "cpu", ) -> Generator[Tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model bin/pt files.""" for bin_file in tqdm( @@ -510,7 +511,9 @@ def pt_weights_iterator( disable=not enable_tqdm(use_tqdm_on_load), bar_format=_BAR_FORMAT, ): - state = torch.load(bin_file, map_location="cpu", weights_only=True) + state = torch.load(bin_file, + map_location=pt_load_map_location, + weights_only=True) yield from state.items() del state diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index dfe8f20c70d62..c518efdb54f89 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -175,10 +175,8 @@ class ArcticMoE(nn.Module): # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) do_normalize = self.top_k > 1 - topk_weights, topk_ids = fused_topk(hidden_states, - router_logits, - self.top_k, - renormalize=do_normalize) + topk_weights, topk_ids, token_expert_indices = fused_topk( + hidden_states, router_logits, self.top_k, renormalize=do_normalize) # topk_ids: (num_tokens, k) if self.is_quant: if 2 * num_tokens <= self.num_experts: diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index ffa5840b46041..ce86b9b2c4f04 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -454,9 +454,7 @@ class DeepseekV2MLAAttention(nn.Module): qk_head_dim=self.qk_head_dim, v_head_dim=self.v_head_dim, rotary_emb=self.rotary_emb, - q_proj=self.q_proj if self.q_lora_rank is None else self.q_b_proj, kv_b_proj=self.kv_b_proj, - o_proj=self.o_proj, ) self.prefix = prefix @@ -468,17 +466,22 @@ class DeepseekV2MLAAttention(nn.Module): hidden_states: torch.Tensor, ) -> torch.Tensor: if self.q_lora_rank is not None: - ckq = self.q_a_proj(hidden_states)[0] - hidden_states_or_q_c = self.q_a_layernorm(ckq) + q_c = self.q_a_proj(hidden_states)[0] + q_c = self.q_a_layernorm(q_c) + q = self.q_b_proj(q_c)[0] else: - hidden_states_or_q_c = hidden_states + q = self.q_proj(hidden_states)[0] kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split( [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) - return self.mla_attn(hidden_states_or_q_c, - kv_c_normed, - k_pe, - output_shape=hidden_states.shape) + + attn_out = self.mla_attn( + q, + kv_c_normed, + k_pe, + output_shape=(hidden_states.shape[0], + self.num_local_heads * self.v_head_dim)) + return self.o_proj(attn_out)[0] class DeepseekV2DecoderLayer(nn.Module): diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index c42f19fee17dd..904ff3210943a 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -6,7 +6,8 @@ import torch import torch.nn as nn from transformers import LlamaConfig -from vllm.config import ModelConfig, VllmConfig +from vllm.compilation.decorators import support_torch_compile +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import QKVParallelLinear @@ -76,17 +77,19 @@ class LlamaDecoderLayer(LlamaDecoderLayer): return hidden_states, residual +@support_torch_compile class LlamaModel(nn.Module): def __init__( self, *, - model_config: ModelConfig, + vllm_config: VllmConfig, start_layer_id: int = 0, prefix: str = "", ) -> None: super().__init__() - self.config = model_config.hf_config + self.config = vllm_config. \ + speculative_config.draft_model_config.hf_config self.vocab_size = self.config.vocab_size self.embed_tokens = VocabParallelEmbedding( self.config.vocab_size, @@ -119,8 +122,7 @@ class LlamaModel(nn.Module): hidden_states: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: input_embeds = self.embed_tokens(input_ids) - if (hidden_states.shape[-1] != input_embeds.shape[-1]): - hidden_states = self.fc(hidden_states) + assert hidden_states.shape[-1] == input_embeds.shape[-1] residual = None hidden_states, residual = self.layers[0]( @@ -169,9 +171,9 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM): def __init__(self, *, vllm_config: VllmConfig, start_layer_id: int = 0): nn.Module.__init__(self) - model_config = vllm_config.speculative_config.draft_model_config - self.config = model_config.hf_config - self.model = LlamaModel(model_config=model_config, + self.config = vllm_config. \ + speculative_config.draft_model_config.hf_config + self.model = LlamaModel(vllm_config=vllm_config, start_layer_id=start_layer_id, prefix="model") @@ -214,6 +216,13 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM): logits_new[:, targets] = logits return logits_new + def combine_hidden_states( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + # combine multiple auxiliary hidden states returned by eagle3 + return self.model.fc(hidden_states) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): loader = AutoWeightsLoader( self, diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 16f5327ee79b9..3791b92ecc2a0 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -354,9 +354,8 @@ class PixtralHFMultiModalProcessor( image_token_id = hf_config.image_token_index image_end_id = vocab[processor.image_end_token] - vision_config = hf_config.vision_config - assert isinstance(vision_config, PixtralVisionConfig) - encoder_info = PixtralHFEncoderInfo(vision_config) + assert isinstance(hf_config.vision_config, PixtralVisionConfig) + encoder_info = PixtralHFEncoderInfo(hf_config) def get_replacement(item_idx: int): images = mm_items.get_items("image", ImageProcessorItems) diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index c9abe4142be57..6352ba236818f 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -272,12 +272,8 @@ class Mistral3MultiModalProcessor( image_token_id = hf_config.image_token_index image_end_id = vocab[processor.image_end_token] - vision_config = hf_config.vision_config - assert isinstance(vision_config, PixtralVisionConfig) - # Need to sneak in spatial_merge_size for Mistral3 - vision_config.spatial_merge_size = getattr(hf_config, - "spatial_merge_size", 1) - encoder_info = PixtralHFEncoderInfo(vision_config) + assert isinstance(hf_config.vision_config, PixtralVisionConfig) + encoder_info = PixtralHFEncoderInfo(hf_config) def get_replacement(item_idx: int): images = mm_items.get_items("image", ImageProcessorItems) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index d756b3b8a7caa..7b11a616e585d 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -916,8 +916,9 @@ class PixtralHFEncoderInfo(VisionEncoderInfo[PixtralVisionConfig]): return self.vision_config.image_size def get_patch_size(self) -> int: - return (self.vision_config.patch_size * - self.vision_config.spatial_merge_size) + # spatial_merge_size is needed for Mistral3 + spatial_merge_size = getattr(self.hf_config, "spatial_merge_size", 1) + return self.vision_config.patch_size * spatial_merge_size def get_patch_grid_length(self) -> int: image_size, patch_size = self.get_image_size(), self.get_patch_size() diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 05e3b3f3ccdf3..901d83ec5b9e6 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -19,10 +19,11 @@ _C = TypeVar("_C", bound=PretrainedConfig) class VisionEncoderInfo(ABC, Generic[_C]): - def __init__(self, vision_config: _C) -> None: + def __init__(self, hf_config: _C) -> None: super().__init__() - self.vision_config = vision_config + self.hf_config = hf_config + self.vision_config = hf_config.vision_config @abstractmethod def get_num_image_tokens( @@ -57,18 +58,14 @@ def get_vision_encoder_info( from .pixtral import PixtralHFEncoderInfo, PixtralVisionConfig from .siglip import SiglipEncoderInfo, SiglipVisionConfig - vision_config = hf_config.vision_config - if isinstance(vision_config, CLIPVisionConfig): - return CLIPEncoderInfo(vision_config) - if isinstance(vision_config, PixtralVisionConfig): - # Need to sneak in spatial_merge_size for Mistral3 - vision_config.spatial_merge_size = getattr(hf_config, - "spatial_merge_size", 1) - return PixtralHFEncoderInfo(vision_config) - if isinstance(vision_config, SiglipVisionConfig): - return SiglipEncoderInfo(vision_config) + if isinstance(hf_config.vision_config, CLIPVisionConfig): + return CLIPEncoderInfo(hf_config) + if isinstance(hf_config.vision_config, PixtralVisionConfig): + return PixtralHFEncoderInfo(hf_config) + if isinstance(hf_config.vision_config, SiglipVisionConfig): + return SiglipEncoderInfo(hf_config) - msg = f"Unsupported vision config: {type(vision_config)}" + msg = f"Unsupported vision config: {type(hf_config.vision_config)}" raise NotImplementedError(msg) diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index 11665ef667538..53e289370a9f4 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -31,16 +31,20 @@ class MultiModalHasher: return obj.encode("utf-8") if isinstance(obj, bytes): return obj - if isinstance(obj, Image.Image): - return obj.tobytes() - - # Convertible to NumPy arrays - if isinstance(obj, torch.Tensor): - obj = obj.numpy() if isinstance(obj, (int, float)): - obj = np.array(obj) + return np.array(obj).tobytes() + + if isinstance(obj, Image.Image): + return cls.item_to_bytes("image", np.array(obj.convert("RGBA"))) + if isinstance(obj, torch.Tensor): + return cls.item_to_bytes("tensor", obj.numpy()) if isinstance(obj, np.ndarray): - return obj.tobytes() + return cls.item_to_bytes( + "ndarray", { + "dtype": obj.dtype.str, + "shape": obj.shape, + "data": obj.data.tobytes(), + }) logger.warning( "No serialization method found for %s. " @@ -53,14 +57,22 @@ class MultiModalHasher: cls, key: str, obj: object, + ) -> bytes: + return b''.join(kb + vb for kb, vb in cls.iter_item_to_bytes(key, obj)) + + @classmethod + def iter_item_to_bytes( + cls, + key: str, + obj: object, ) -> Iterable[tuple[bytes, bytes]]: # Recursive cases if isinstance(obj, (list, tuple)): for i, elem in enumerate(obj): - yield from cls.item_to_bytes(f"{key}.{i}", elem) + yield from cls.iter_item_to_bytes(f"{key}.{i}", elem) elif isinstance(obj, dict): for k, v in obj.items(): - yield from cls.item_to_bytes(f"{key}.{k}", v) + yield from cls.iter_item_to_bytes(f"{key}.{k}", v) else: key_bytes = cls.serialize_item(key) value_bytes = cls.serialize_item(obj) @@ -71,7 +83,7 @@ class MultiModalHasher: hasher = blake3() for k, v in kwargs.items(): - for k_bytes, v_bytes in cls.item_to_bytes(k, v): + for k_bytes, v_bytes in cls.iter_item_to_bytes(k, v): hasher.update(k_bytes) hasher.update(v_bytes) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index e8745a8f1f901..58168d0e850c2 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1670,15 +1670,17 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): placeholders = mm_placeholders.get(modality, []) if len(placeholders) != item_count: + # NOTE: If you are a model developer, this can also arise from + # an inconsistency between `_call_hf_processor` and + # `_get_mm_fields_config` implementations raise RuntimeError( f"Expected there to be {item_count} prompt updates " f"corresponding to {item_count} {modality} items, but " f"instead found {len(placeholders)} prompt updates! " - "Either the prompt text has missing/incorrect tokens for " - "multi-modal inputs, or there is a problem with your " - "implementation of merged multi-modal processor for this " - "model (usually arising from an inconsistency between " - "`_call_hf_processor` and `_get_prompt_updates`).") + "This is likely because you forgot to include input " + "placeholder tokens (e.g., ``, `<|image_pad|>`) " + "in the prompt. If the model has a chat template, make " + "sure you have applied it before calling `LLM.generate`.") def _maybe_apply_prompt_updates( self, diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index c5555aba1a3e3..6a78e00a90495 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -406,12 +406,12 @@ class Platform: """Raises if this request is unsupported on this platform""" def __getattr__(self, key: str): - device = getattr(torch, self.device_name, None) + device = getattr(torch, self.device_type, None) if device is not None and hasattr(device, key): return getattr(device, key) else: logger.warning("Current platform %s does not have '%s'" \ - " attribute.", self.device_name, key) + " attribute.", self.device_type, key) return None @classmethod diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index de097ab9af1b9..ff63f9656c01b 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -58,6 +58,15 @@ _ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = { "excessive use of shared memory. If this happens, disable Triton FA " "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`") } +_ROCM_DEVICE_ID_NAME_MAP: Dict[str, str] = { + "0x74a0": "AMD_Instinct_MI300A", + "0x74a1": "AMD_Instinct_MI300X", + "0x74b5": "AMD_Instinct_MI300X", # MI300X VF + "0x74a5": "AMD_Instinct_MI325X", + "0x74b9": "AMD_Instinct_MI325X", # MI325X VF + "0x74a9": "AMD_Instinct_MI300X_HF", + "0x74bd": "AMD_Instinct_MI300X_HF", +} # Prevent use of clashing `{CUDA/HIP}_VISIBLE_DEVICES`` if "HIP_VISIBLE_DEVICES" in os.environ: @@ -225,7 +234,11 @@ class RocmPlatform(Platform): def get_device_name(cls, device_id: int = 0) -> str: physical_device_id = device_id_to_physical_device_id(device_id) handle = amdsmi_get_processor_handles()[physical_device_id] - return amdsmi_get_gpu_asic_info(handle)["market_name"] + asic_info = amdsmi_get_gpu_asic_info(handle) + device_name: str = asic_info["device_id"] + if device_name in _ROCM_DEVICE_ID_NAME_MAP: + return _ROCM_DEVICE_ID_NAME_MAP[device_name] + return asic_info["market_name"] @classmethod def get_device_total_memory(cls, device_id: int = 0) -> int: diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index d5923557a2112..9c95e6d3fa087 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Optional, Union import torch +from tpu_info import device import vllm.envs as envs from vllm.inputs import ProcessorInputs, PromptType @@ -54,7 +55,8 @@ class TpuPlatform(Platform): @classmethod def get_device_name(cls, device_id: int = 0) -> str: - return "tpu" + chip_type, _ = device.get_local_chips() + return f"TPU {chip_type.name}" @classmethod def get_device_total_memory(cls, device_id: int = 0) -> int: diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 3ac5c5c3daab5..66a77681be9a3 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -186,9 +186,9 @@ class SamplingParams( logits_processors: list of functions that modify logits based on previously generated tokens, and optionally prompt tokens as a first argument. - truncate_prompt_tokens: If set to -1, will use the truncation size - supported by the model. If set to an integer k, will use only - the last k tokens from the prompt (i.e., left truncation). + truncate_prompt_tokens: If set to -1, will use the truncation size + supported by the model. If set to an integer k, will use only + the last k tokens from the prompt (i.e., left truncation). Defaults to None (i.e., no truncation). guided_decoding: If provided, the engine will construct a guided decoding logits processor from these parameters. Defaults to None. diff --git a/vllm/sequence.py b/vllm/sequence.py index a97409523c945..5bc9b8a6fc82a 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -166,6 +166,9 @@ class SequenceData(msgspec.Struct, _output_token_ids: array = msgspec.field( default_factory=lambda: array(VLLM_TOKEN_ID_ARRAY_TYPE, [])) + _prompt_embeds: Optional[torch.Tensor] = None + _output_embeds: Optional[torch.Tensor] = None + ### The below fields should not be passed as an argument ### _cumulative_logprob: float = 0.0 _prompt_token_ids_tuple: tuple[int, @@ -176,6 +179,7 @@ class SequenceData(msgspec.Struct, _num_cached_tokens: int = 0 _stage: SequenceStage = SequenceStage.PREFILL _cached_all_token_ids: list[int] = msgspec.field(default_factory=list) + _cached_all_token_embeds: Optional[torch.Tensor] = None # It is used to get delta input. It is reset when `get_delta_and_reset` # is called. @@ -208,6 +212,8 @@ class SequenceData(msgspec.Struct, def from_seqs( prompt_token_ids: GenericSequence[int], output_token_ids: Optional[GenericSequence[int]] = None, + *, + prompt_embeds: Optional[torch.Tensor] = None, ) -> "SequenceData": """ Construct a :class:`SequenceData` instance from prompt and output @@ -217,13 +223,15 @@ class SequenceData(msgspec.Struct, prompt_token_ids) if output_token_ids is None: - return SequenceData(prompt_token_ids_arr) + return SequenceData(prompt_token_ids_arr, + _prompt_embeds=prompt_embeds) output_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE, output_token_ids) return SequenceData(prompt_token_ids_arr, - _output_token_ids=output_token_ids_arr) + _output_token_ids=output_token_ids_arr, + _prompt_embeds=prompt_embeds) def __post_init__(self) -> None: assert self._prompt_token_ids.typecode == "l" @@ -231,6 +239,8 @@ class SequenceData(msgspec.Struct, self._prompt_token_ids_tuple: tuple[int, ...] = tuple( self._prompt_token_ids) self._update_cached_all_tokens() + if self._prompt_embeds is not None: + self._update_cached_all_token_embeds() def _update_cached_all_tokens(self): assert isinstance(self._prompt_token_ids, array) @@ -238,6 +248,13 @@ class SequenceData(msgspec.Struct, self._cached_all_token_ids: list[int] = list(self._prompt_token_ids + self._output_token_ids) + def _update_cached_all_token_embeds(self): + assert isinstance(self._prompt_embeds, torch.Tensor) + self._cached_all_token_embeds: torch.Tensor = self._prompt_embeds + if self._output_embeds is not None: + self._cached_all_token_embeds = torch.cat( + (self._cached_all_token_embeds, self._output_embeds), dim=0) + @property def cumulative_logprob(self) -> float: return self._cumulative_logprob @@ -270,6 +287,15 @@ class SequenceData(msgspec.Struct, new_output_token_ids) self._update_cached_all_tokens() + @property + def output_embeds(self) -> Optional[torch.Tensor]: + return self._output_embeds + + @output_embeds.setter + def output_embeds(self, new_output_token_embeds: torch.Tensor) -> None: + self._output_token_embeds = new_output_token_embeds + self._update_cached_all_token_embeds() + @property def output_token_ids_array(self) -> array: """Return the prompt token ids in array type. @@ -280,6 +306,15 @@ class SequenceData(msgspec.Struct, assert isinstance(self._output_token_ids, array) return self._output_token_ids + @property + def prompt_embeds(self) -> Optional[torch.Tensor]: + return self._prompt_embeds + + @prompt_embeds.setter + def prompt_embeds(self, prompt_embeds: torch.Tensor) -> None: + self._prompt_embeds = prompt_embeds + self._update_cached_all_token_embeds() + @property def mrope_position_delta(self) -> Optional[int]: return self._mrope_position_delta @@ -288,11 +323,28 @@ class SequenceData(msgspec.Struct, def mrope_position_delta(self, new_mrope_position_delta): self._mrope_position_delta = new_mrope_position_delta - def append_token_id(self, token_id: int, logprob: float) -> None: + def append_token_id(self, + token_id: int, + logprob: float, + token_embed: Optional[torch.Tensor] = None) -> None: self._output_token_ids.append(token_id) self._new_appended_tokens.append(token_id) self._cached_all_token_ids.append(token_id) self._cumulative_logprob += logprob + if token_embed is not None: + # Do not pass in with batch or sequence dimensions + assert token_embed.ndim == 1 + token_embed = token_embed.detach().cpu().unsqueeze(0) + if self._output_embeds is None: + self._output_embeds = token_embed + else: + self._output_embeds = torch.cat( + (self._output_embeds, token_embed), dim=0) + assert self._cached_all_token_embeds is not None + self._cached_all_token_embeds = torch.cat( + (self._cached_all_token_embeds, + token_embed.to(device=self._cached_all_token_embeds.device)), + dim=0) def get_len(self) -> int: return len(self._output_token_ids) + len(self._prompt_token_ids) @@ -306,6 +358,9 @@ class SequenceData(msgspec.Struct, def get_token_ids(self) -> list[int]: return self._cached_all_token_ids + def get_token_embeddings(self) -> Optional[torch.Tensor]: + return self._cached_all_token_embeds + def get_prefix_token_ids( self, num_tokens: int ) -> tuple[tuple[int, ...], Optional[tuple[int, ...]]]: @@ -387,6 +442,8 @@ class SequenceData(msgspec.Struct, def __repr__(self) -> str: return (f"SequenceData(" f"prompt_token_ids={self._prompt_token_ids}, " + f"prompt_embeds.shape=" + f"{getattr(self._prompt_embeds, 'shape', None)}, " f"output_token_ids={self.output_token_ids}, " f"cumulative_logprob={self.cumulative_logprob}, " f"get_num_computed_tokens={self.get_num_computed_tokens()})") @@ -425,7 +482,10 @@ class Sequence: self.lora_request = lora_request self.prompt_adapter_request = prompt_adapter_request - self.data = SequenceData.from_seqs(self.prompt_token_ids) + self.data = SequenceData.from_seqs( + self.prompt_token_ids, + prompt_embeds=self.inputs["prompt_embeds"] + if self.inputs["type"] == "embeds" else None) self.output_logprobs: SampleLogprobs = [] self.output_text = "" @@ -448,14 +508,20 @@ class Sequence: @property def prompt(self) -> Optional[str]: + if self.inputs["type"] == "embeds": + return None return self.inputs.get("prompt") @property def prompt_token_ids(self) -> list[int]: + if self.inputs["type"] == "embeds": + return [0] * len(self.inputs["prompt_embeds"]) return self.inputs["prompt_token_ids"] @property def token_type_ids(self) -> list[int]: + if self.inputs["type"] == "embeds": + return [] return self.inputs.get("token_type_ids", []) @property @@ -554,11 +620,14 @@ class Sequence: """Reset the sequence states for recomputation.""" self.data.reset_state_for_recompute() - def append_token_id(self, token_id: int, logprobs: dict[int, - Logprob]) -> None: + def append_token_id(self, + token_id: int, + logprobs: dict[int, Logprob], + token_embed: Optional[torch.Tensor] = None) -> None: assert token_id in logprobs self.output_logprobs.append(logprobs) - self.data.append_token_id(token_id, logprobs[token_id].logprob) + self.data.append_token_id(token_id, logprobs[token_id].logprob, + token_embed) def get_len(self) -> int: return self.data.get_len() @@ -889,6 +958,10 @@ class SequenceGroup: f"sampling_params={self.sampling_params}, " f"num_seqs={len(self.seqs)})") + def uses_prompt_embeds(self) -> bool: + """Returns True if the sequence group uses input embeds.""" + return any(seq.data.prompt_embeds is not None for seq in self.seqs) + class SequenceGroupMetadataDelta( msgspec.Struct, @@ -1043,10 +1116,14 @@ class SequenceOutput( parent_seq_id: int output_token: int logprobs: dict[int, Logprob] + output_embed: Optional[torch.Tensor] = None def __repr__(self) -> str: + output_embed_shape = \ + self.output_embed.shape if self.output_embed is not None else None return (f"SequenceOutput(parent_seq_id={self.parent_seq_id}, " f"output_token={self.output_token}, " + f"output_embed.shape={output_embed_shape}" f"logprobs={self.logprobs})") def __eq__(self, other: object) -> bool: diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 24095ef2a5675..a6276c5633945 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -201,6 +201,9 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase): if self.prompt_adapter_config is not None: raise ValueError("TP1DraftModelRunner has no support for " "prompt_adapter_config") + if model_input.inputs_embeds is not None: + raise ValueError("TP1DraftModelRunner has no support for " + "inputs_embeds") if model_input.multi_modal_kwargs: raise ValueError( "TP1DraftModelRunner has no support for multi_modal_kwargs" @@ -242,9 +245,16 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase): # Get model if use_cuda_graph: - graph_batch_size = model_input.input_tokens.shape[0] - model_executable = (self.graph_runners[model_input.virtual_engine] - [graph_batch_size]) + if model_input.inputs_embeds is None: + graph_batch_size = model_input.input_tokens.shape[0] + model_executable = ( + self.graph_runners[model_input.virtual_engine][( + graph_batch_size, False)]) + else: + graph_batch_size = model_input.inputs_embeds.shape[0] + model_executable = ( + self.graph_runners[model_input.virtual_engine][( + graph_batch_size, True)]) if previous_hidden_states is not None: hidden_states = torch.cat([ @@ -281,6 +291,7 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase): self.vllm_config): hidden_states = model_executable( input_ids=model_input.input_tokens, + inputs_embeds=None, positions=model_input.input_positions, intermediate_tensors=intermediate_tensors, **MultiModalKwargs.as_kwargs(multi_modal_kwargs, diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 1146606e9a13d..de57403d1b50e 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -282,7 +282,8 @@ class MultiStepWorker(ProposerWorkerBase, DelegateWorkerBase): else: count += 1 - seq.append_token_id(token_id, token_logprob.logprob) + seq.append_token_id(token_id, token_logprob.logprob, + seq_output.output_embed) seq.update_num_computed_tokens(1) @staticmethod diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py index fa29efbf6b2d4..0a0c0a4bd1786 100644 --- a/vllm/triton_utils/importing.py +++ b/vllm/triton_utils/importing.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import sys import types from importlib.util import find_spec @@ -45,9 +44,4 @@ if not HAS_TRITON: super().__init__("triton.language") self.constexpr = None self.dtype = None - - sys.modules['triton'] = TritonPlaceholder() - sys.modules['triton.language'] = TritonLanguagePlaceholder() - -if 'triton' in sys.modules: - logger.info("Triton module has been replaced with a placeholder.") + self.int64 = None diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 217dcd7c33ac4..f986d797f2b03 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -10,9 +10,11 @@ from vllm import _custom_ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType, is_quantized_kv_cache) +from vllm.attention.layer import Attention from vllm.attention.ops.merge_attn_states import merge_attn_states from vllm.attention.utils.fa_utils import (flash_attn_supports_fp8, get_flash_attn_version) +from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils import cdiv @@ -273,13 +275,23 @@ def make_local_attention_virtual_batches( block_table_local +def _get_sliding_window_configs( + vllm_config: VllmConfig) -> set[Optional[tuple[int, int]]]: + """Get the set of all sliding window configs used in the model.""" + sliding_window_configs: set[Optional[tuple[int, int]]] = set() + layers = get_layers_from_vllm_config(vllm_config, Attention) + for layer in layers.values(): + assert isinstance(layer.impl, FlashAttentionImpl) + sliding_window_configs.add(layer.impl.sliding_window) + return sliding_window_configs + + class FlashAttentionMetadataBuilder: def __init__(self, runner: "GPUModelRunner"): model_config = runner.model_config self.runner = runner - self.aot_schedule = (get_flash_attn_version() == 3) self.num_heads_q = model_config.get_num_attention_heads( runner.parallel_config) self.num_heads_kv = model_config.get_num_kv_heads( @@ -287,6 +299,11 @@ class FlashAttentionMetadataBuilder: self.headdim = model_config.get_head_size() self.page_size = self.runner.block_size + self.aot_schedule = (get_flash_attn_version() == 3) + # Sliding window size to be used with the AOT scheduler will be + # populated on first build() call. + self.aot_sliding_window: Optional[tuple[int, int]] = None + def reorder_batch(self, input_batch: "InputBatch", scheduler_output: "SchedulerOutput") -> bool: return False @@ -304,6 +321,22 @@ class FlashAttentionMetadataBuilder: slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to( self.runner.device, non_blocking=True).long() + if self.aot_sliding_window is None: + self.aot_sliding_window = (-1, -1) + # For the AOT scheduler we need the sliding window value to be + # constant for all layers to. We have to populate this on the first + # build() call so the layers are constructed (cannot populate) + # in __init__. + if self.aot_schedule: + sliding_window_configs = _get_sliding_window_configs( + self.runner.vllm_config) + if len(sliding_window_configs) == 1: + sliding_window_config = sliding_window_configs.pop() + if sliding_window_config is not None: + self.aot_sliding_window = sliding_window_config + elif len(sliding_window_configs) > 1: + self.aot_schedule = False + def schedule(batch_size, cu_query_lens, max_query_len, seqlens, max_seq_len, causal): if self.aot_schedule: @@ -318,6 +351,7 @@ class FlashAttentionMetadataBuilder: page_size=self.page_size, cu_seqlens_q=cu_query_lens, causal=causal, + window_size=self.aot_sliding_window, ) return None diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index fd3be901f4c38..3e77555d7f942 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -200,7 +200,7 @@ from vllm.attention.ops.merge_attn_states import merge_attn_states from vllm.attention.utils.fa_utils import get_flash_attn_version from vllm.logger import init_logger from vllm.model_executor.layers.linear import (ColumnParallelLinear, - LinearBase, RowParallelLinear, + LinearBase, UnquantizedLinearMethod) from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding from vllm.platforms import current_platform @@ -597,12 +597,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): qk_head_dim: int, v_head_dim: int, rotary_emb: RotaryEmbedding, - # q_proj should be q_b_proj if q_lora_rank is not None, but from an - # attention backend perspective we rely on the layer to pass in the - # correct matrix - q_proj: ColumnParallelLinear, kv_b_proj: ColumnParallelLinear, - o_proj: RowParallelLinear, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -625,9 +620,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): if current_platform.is_cuda(): self.rotary_emb = rotary_emb.forward_cuda - self.q_proj = q_proj self.kv_b_proj = kv_b_proj - self.o_proj = o_proj self.vllm_flash_attn_version = get_flash_attn_version() # Handle the differences between the flash_attn_varlen from flash_attn @@ -684,27 +677,13 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): return attn_out, lse return attn_out - def _v_up_proj_and_o_proj(self, x): + def _v_up_proj(self, x): # Convert from (B, N, L) to (N, B, L) x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) # Multiply (N, B, L) x (N, L, V) -> (N, B, V) x = torch.bmm(x, self.W_UV) # Convert from (N, B, V) to (B, N * V) - x = x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim) - return self.o_proj(x)[0] - - # Return `ql_nope`, `q_pe` - def _q_proj_and_k_up_proj(self, x): - q_nope, q_pe = self.q_proj(x)[0]\ - .view(-1, self.num_heads, self.qk_head_dim)\ - .split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) - - # Convert from (B, N, P) to (N, B, P) - q_nope = q_nope.transpose(0, 1) - # Multiply (N, B, P) x (N, P, L) -> (N, B, L) - ql_nope = torch.bmm(q_nope, self.W_UK_T) - # Convert from (N, B, L) to (B, N, L) - return ql_nope.transpose(0, 1), q_pe + return x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim) def process_weights_after_loading(self, act_dtype: torch.dtype): @@ -874,7 +853,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): suffix_lse=suffix_lse, ) - return self.o_proj(output.flatten(start_dim=-2))[0] + return output.flatten(start_dim=-2) @abstractmethod def _forward_decode( @@ -889,7 +868,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): def forward( self, layer: AttentionLayer, - hidden_states_or_q_c: torch.Tensor, # query in unified attn + q: torch.Tensor, k_c_normed: torch.Tensor, # key in unified attn k_pe: torch.Tensor, # value in unified attn kv_cache: torch.Tensor, @@ -908,7 +887,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): # Inputs and outputs may be padded for CUDA graphs output_padded = output output = output[:num_actual_toks, ...] - hidden_states_or_q_c = hidden_states_or_q_c[:num_actual_toks, ...] + q = q[:num_actual_toks, ...] k_c_normed = k_c_normed[:num_actual_toks, ...] k_pe = k_pe[:num_actual_toks, ...] @@ -923,24 +902,29 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): has_prefill = attn_metadata.num_prefills > 0 num_decode_tokens = attn_metadata.num_decode_tokens - decode_hs_or_q_c = hidden_states_or_q_c[:num_decode_tokens] + q = q.view(-1, self.num_heads, self.qk_head_dim) + decode_q = q[:num_decode_tokens] decode_k_pe = k_pe[:num_decode_tokens] - prefill_hs_or_q_c = hidden_states_or_q_c[num_decode_tokens:] + prefill_q = q[num_decode_tokens:] prefill_k_pe = k_pe[num_decode_tokens:] prefill_k_c_normed = k_c_normed[num_decode_tokens:] if has_decode: assert attn_metadata.decode is not None - decode_ql_nope, decode_q_pe = \ - self._q_proj_and_k_up_proj(decode_hs_or_q_c) + decode_q_nope, decode_q_pe = decode_q.split( + [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + # Convert from (B, N, P) to (N, B, P) + decode_q_nope = decode_q_nope.transpose(0, 1) + # Multiply (N, B, P) x (N, P, L) -> (N, B, L) + decode_ql_nope = torch.bmm(decode_q_nope, self.W_UK_T) + # Convert from (N, B, L) to (B, N, L) + decode_ql_nope = decode_ql_nope.transpose(0, 1) decode_q_pe[...], decode_k_pe[...] = self.rotary_emb( attn_metadata.decode.input_positions, decode_q_pe, decode_k_pe) if has_prefill: assert attn_metadata.prefill is not None - prefill_q = self.q_proj(prefill_hs_or_q_c)[0]\ - .view(-1, self.num_heads, self.qk_head_dim) prefill_q_pe = prefill_q[..., self.qk_nope_head_dim:] prefill_q_pe[...], prefill_k_pe[...] = self.rotary_emb( diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 143bfe35bb5e5..f18c9c8b6462c 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -146,4 +146,4 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): causal=True, ) - return self._v_up_proj_and_o_proj(o) + return self._v_up_proj(o) diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py index 8e7e4f10b81b8..2e6b619db6287 100644 --- a/vllm/v1/attention/backends/mla/triton_mla.py +++ b/vllm/v1/attention/backends/mla/triton_mla.py @@ -115,4 +115,4 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]): attn_metadata.decode.seq_lens, attn_logits, num_kv_splits, self.scale, PAGE_SIZE) - return self._v_up_proj_and_o_proj(o) + return self._v_up_proj(o) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 802fe566ae2ed..c3f8a4908fda1 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -775,7 +775,10 @@ class Scheduler(SchedulerInterface): # Return the cached request data to the queue so they can be reused. for req_data in scheduler_output.scheduled_cached_reqs: - self._cached_reqs_data[req_data.req_id].append(req_data) + # NOTE(rob): since we free stopped reqs above, adding stopped reqs + # to _cached_reqs_data will cause a memory leak. + if req_data.req_id not in self.finished_req_ids: + self._cached_reqs_data[req_data.req_id].append(req_data) self.running = new_running engine_core_outputs = EngineCoreOutputs( diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 81508c2e069ba..07097d7da68f8 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -10,6 +10,7 @@ from vllm.logger import init_logger from vllm.model_executor.model_loader.loader import get_model_loader from vllm.model_executor.model_loader.utils import set_default_torch_dtype from vllm.model_executor.models import ModelRegistry +from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata from vllm.v1.sample.metadata import SamplingMetadata @@ -39,11 +40,9 @@ class EagleProposer: self.hidden_size = vllm_config.model_config.get_hidden_size() - # TODO: make eagle3 compatible with cudagraph - self.use_cuda_graph = self.method != 'eagle3' and \ - (self.vllm_config.compilation_config.level - == CompilationLevel.PIECEWISE and - not self.vllm_config.model_config.enforce_eager) + self.use_cuda_graph = (self.vllm_config.compilation_config.level + == CompilationLevel.PIECEWISE and + not self.vllm_config.model_config.enforce_eager) self.cudagraph_batch_sizes = list( reversed( @@ -90,6 +89,12 @@ class EagleProposer: batch_size = next_token_ids.shape[0] last_token_indices = cu_num_tokens[1:] - 1 + if self.method == "eagle3": + assert isinstance(self.model, Eagle3LlamaForCausalLM) + target_hidden_states = self.model.combine_hidden_states( + target_hidden_states) + assert target_hidden_states.shape[-1] == self.hidden_size + # Shift the input ids by one token. # E.g., [a1, b1, b2, c1, c2, c3] -> [b1, b2, c1, c2, c3, c3] self.input_ids[:num_tokens - 1] = target_token_ids[1:] @@ -126,12 +131,7 @@ class EagleProposer: # copy inputs to buffer for cudagraph self.positions[:num_tokens] = target_positions - if self.method == 'eagle': - self.hidden_states[:num_tokens] = target_hidden_states - hidden_states = self.hidden_states - else: - # TODO: make eagle3 compatible with cuda graph - hidden_states = target_hidden_states + self.hidden_states[:num_tokens] = target_hidden_states with set_forward_context(attn_metadata, self.vllm_config, @@ -139,7 +139,7 @@ class EagleProposer: last_hidden_states, hidden_states = self.model( input_ids=self.input_ids[:num_input_tokens], positions=self.positions[:num_input_tokens], - hidden_states=hidden_states[:num_input_tokens], + hidden_states=self.hidden_states[:num_input_tokens], ) sample_hidden_states = last_hidden_states[last_token_indices] logits = self.model.compute_logits(sample_hidden_states, None) @@ -209,10 +209,7 @@ class EagleProposer: self.input_ids[:batch_size] = input_ids self.positions[:batch_size] = clamped_positions - if self.method == 'eagle': - # TODO: make eagle3 compatible with cudagraph. - self.hidden_states[:batch_size] = hidden_states - hidden_states = self.hidden_states + self.hidden_states[:batch_size] = hidden_states # Run the model. with set_forward_context(attn_metadata, @@ -221,7 +218,7 @@ class EagleProposer: last_hidden_states, hidden_states = self.model( input_ids=self.input_ids[:input_batch_size], positions=self.positions[:input_batch_size], - hidden_states=hidden_states[:input_batch_size], + hidden_states=self.hidden_states[:input_batch_size], ) hidden_states = hidden_states[:batch_size] logits = self.model.compute_logits(last_hidden_states[:batch_size], @@ -314,12 +311,11 @@ class EagleProposer: ) -> None: with set_forward_context(None, self.vllm_config, num_tokens=num_tokens): - if self.method == 'eagle': - self.model( - input_ids=self.input_ids[:num_tokens], - positions=self.positions[:num_tokens], - hidden_states=self.hidden_states[:num_tokens], - ) + self.model( + input_ids=self.input_ids[:num_tokens], + positions=self.positions[:num_tokens], + hidden_states=self.hidden_states[:num_tokens], + ) # NOTE(woosuk): Currently, the below code is not used and we always use argmax diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 4df192a8727c3..4864163b0de2a 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -49,6 +49,7 @@ class EncoderDecoderModelInput(ModelInputForGPUWithSamplingMetadata): def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, + "inputs_embeds": self.inputs_embeds, "input_positions": self.input_positions, "encoder_input_tokens": self.encoder_input_tokens, "encoder_input_positions": self.encoder_input_positions, @@ -172,10 +173,17 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): if (model_input.attn_metadata is not None and model_input.attn_metadata.prefill_metadata is None and model_input.attn_metadata.decode_metadata.use_cuda_graph): - assert model_input.input_tokens is not None - graph_batch_size = model_input.input_tokens.shape[0] - model_executable = self.graph_runners[ - model_input.virtual_engine][graph_batch_size] + if model_input.inputs_embeds is None: + assert model_input.input_tokens is not None + graph_batch_size = model_input.input_tokens.shape[0] + model_executable = ( + self.graph_runners[model_input.virtual_engine][( + graph_batch_size, False)]) + else: + graph_batch_size = model_input.inputs_embeds.shape[0] + model_executable = ( + self.graph_runners[model_input.virtual_engine][( + graph_batch_size, True)]) else: model_executable = self.model @@ -189,6 +197,7 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): model_input.virtual_engine): hidden_or_intermediate_states = model_executable( input_ids=model_input.input_tokens, + inputs_embeds=model_input.inputs_embeds, positions=model_input.input_positions, encoder_input_ids=model_input.encoder_input_tokens, encoder_positions=model_input.encoder_input_positions, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 73e0eff9a8b7e..e22bbcc656ff4 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -35,7 +35,8 @@ from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata, SamplingMetadataCache from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler +from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput, + get_sampler) from vllm.model_executor.model_loader import get_model from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.model_executor.models import supports_lora, supports_multimodal @@ -85,6 +86,7 @@ class ModelInputForGPU(ModelRunnerInputBase): additional fields. """ input_tokens: Optional[torch.Tensor] = None + inputs_embeds: Optional[torch.Tensor] = None input_positions: Optional[torch.Tensor] = None token_types: Optional[torch.Tensor] = None seq_lens: Optional[List[int]] = None @@ -105,6 +107,7 @@ class ModelInputForGPU(ModelRunnerInputBase): def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, + "inputs_embeds": self.inputs_embeds, "input_positions": self.input_positions, "lora_requests": self.lora_requests, "lora_mapping": self.lora_mapping, @@ -155,6 +158,7 @@ class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU): def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, + "inputs_embeds": self.inputs_embeds, "input_positions": self.input_positions, "lora_requests": self.lora_requests, "lora_mapping": self.lora_mapping, @@ -194,6 +198,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): def simple_reinit(self): self.input_tokens[0].clear() # type: ignore + self.inputs_embeds = None # type: ignore self.input_positions[0].clear() # type: ignore self.token_types[0].clear() # type: ignore self.mrope_input_positions = None # type: ignore @@ -221,6 +226,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): # Input tokens and positions. input_tokens: Optional[List[List[int]]] = None, + inputs_embeds: Optional[torch.Tensor] = None, input_positions: Optional[List[List[int]]] = None, token_types: Optional[List[List[int]]] = None, mrope_input_positions: Optional[List[List[List[int]]]] = None, @@ -282,6 +288,8 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): for seq_id in range(len(self.seq_ids)): self.input_tokens[seq_id].clear() + self.inputs_embeds = inputs_embeds + if input_positions: self.input_positions = input_positions else: @@ -356,6 +364,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): else: self.input_tokens = input_tokens or [] + self.inputs_embeds = inputs_embeds self.input_positions = input_positions or [] self.token_types = token_types or [] self.mrope_input_positions = mrope_input_positions or None @@ -401,6 +410,26 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): self.lora_index_mapping = [] self.lora_prompt_mapping = [] + def __repr__(self) -> str: + return (f"InterDataForSeqGroup(" + f"request_id={self.request_id}, " + f"seq_ids={self.seq_ids}, " + f"is_prompt={self.is_prompt}, " + f"block_tables={self.block_tables}, " + f"computed_block_nums={self.computed_block_nums}, " + f"n_seqs={self.n_seqs}, " + f"input_tokens={self.input_tokens}, " + f"inputs_embeds.shape=" + f"{getattr(self.inputs_embeds, 'shape', None)}, " + f"input_positions={self.input_positions}, " + f"token_types={self.token_types}, " + f"mrope_input_positions={self.mrope_input_positions}, " + f"seq_lens={self.seq_lens}, " + f"orig_seq_lens={self.orig_seq_lens}, " + f"query_lens={self.query_lens}, " + f"context_lens={self.context_lens}, " + f"multi_modal_kwargs={self.multi_modal_kwargs}") + def gen_inter_data_builder(self, num_seqs: int): return lambda: ModelInputForGPUBuilder.InterDataForSeqGroup( request_id="", @@ -511,13 +540,21 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): context_len = seq_data.get_num_computed_tokens() # Compute tokens. - tokens = seq_data.get_token_ids()[context_len:seq_len] + if seq_data.prompt_embeds is None: + tokens = seq_data.get_token_ids()[context_len:seq_len] + prompt_embeds = None + else: + tokens = [0] * (seq_len - context_len) + prompt_embeds = seq_data.get_token_embeddings( + )[context_len:seq_len] + token_types = seq_group_metadata.token_type_ids inter_data.seq_lens[seq_idx] = seq_len inter_data.orig_seq_lens[seq_idx] = seq_len inter_data.context_lens[seq_idx] = context_len inter_data.input_tokens[seq_idx].extend(tokens) + inter_data.inputs_embeds = prompt_embeds inter_data.input_positions[seq_idx].extend(range(context_len, seq_len)) inter_data.token_types[seq_idx].extend( token_types if token_types else []) @@ -822,15 +859,29 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): create on-device tensors. """ # Combine and flatten intermediate data. - input_tokens = [] - token_types = [] + input_tokens = list[int]() + inputs_embeds_lst = list[torch.Tensor]() + token_types = list[int]() for inter_data in self.inter_data_list: for cur_input_tokens in inter_data.input_tokens: input_tokens.extend(cur_input_tokens) for cur_token_types in inter_data.token_types: token_types.extend(cur_token_types) + if inter_data.inputs_embeds is not None: + inputs_embeds_lst.append( + inter_data.inputs_embeds.to( + dtype=self.runner.model_config.dtype, + device=self.runner.device)) + inputs_embeds: Optional[torch.Tensor] + if len(inputs_embeds_lst) == 0: + inputs_embeds = None + else: + inputs_embeds = torch.cat(inputs_embeds_lst, dim=0).to( + dtype=self.runner.model_config.dtype, + device=self.runner.device) + assert len(inputs_embeds) == len(input_tokens) - if not input_tokens: + if not input_tokens and inputs_embeds is None: # This may happen when all prefill requests hit # prefix caching and there is no decode request. return self.model_input_cls() @@ -980,6 +1031,7 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): return self.model_input_cls( input_tokens=input_tokens_tensor, + inputs_embeds=inputs_embeds, input_positions=input_positions_tensor, token_types=token_types_tensor, attn_metadata=attn_metadata, @@ -1029,7 +1081,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): self.max_batchsize_to_capture = \ self.vllm_config.compilation_config.max_capture_size - self.graph_runners: List[Dict[int, CUDAGraphRunner]] = [ + # + self.graph_runners: List[Dict[Tuple[int, bool], CUDAGraphRunner]] = [ {} for _ in range(self.parallel_config.pipeline_parallel_size) ] self.graph_memory_pool: Optional[Tuple[ @@ -1466,6 +1519,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): input_positions = torch.zeros(max_batch_size, dtype=torch.long, device=self.device) + inputs_embeds = torch.zeros( + (max_batch_size, self.model_config.get_hidden_size()), + dtype=self.model_config.dtype, + device=self.device) if self.model_config.uses_mrope: input_positions = torch.tile(input_positions, (3, 1)).cuda(device=self.device) @@ -1503,15 +1560,24 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): # memory usage of CUDA graph. for virtual_engine in range( self.parallel_config.pipeline_parallel_size): - # Only rank 0 should print progress bar during capture - cudagraph_capture_sizes = (tqdm( - self.vllm_config.compilation_config. + # We need to not only iterate over batch sizes, but also whether + # to use inputs_embeds or not, hence we use the cartesian + # product. + cudagraph_capture_sizes = self.vllm_config.compilation_config\ + .cudagraph_capture_sizes + cudagraph_inputs_embeds = (( + True, False) if self.model_config.enable_prompt_embeds else + (False, )) + compilation_cases = itertools.product( cudagraph_capture_sizes, - desc="Capturing CUDA graph shapes", - ) if get_tensor_model_parallel_rank() == 0 else - self.vllm_config.compilation_config. - cudagraph_capture_sizes) - for batch_size in cudagraph_capture_sizes: + cudagraph_inputs_embeds, + ) + # Only rank 0 should print progress bar during capture + if get_tensor_model_parallel_rank() == 0: + compilation_cases = tqdm( + list(compilation_cases), + desc="Capturing CUDA graph shapes") + for batch_size, use_inputs_embeds in compilation_cases: attn_metadata = ( self.attn_state.graph_capture_get_metadata_for_batch( batch_size, @@ -1542,6 +1608,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): capture_inputs = { "input_ids": input_tokens[:batch_size], + "inputs_embeds": + inputs_embeds[:batch_size] + if use_inputs_embeds else None, "positions": input_positions[..., :batch_size], "intermediate_inputs": @@ -1578,8 +1647,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): virtual_engine): graph_runner.capture(**capture_inputs) self.graph_memory_pool = graph_runner.graph.pool() - self.graph_runners[virtual_engine][batch_size] = ( - graph_runner) + self.graph_runners[virtual_engine][( + batch_size, use_inputs_embeds)] = graph_runner if self.lora_config: self._remove_dummy_loras() @@ -1711,8 +1780,9 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): if prefill_meta is None and decode_meta.use_cuda_graph: assert model_input.input_tokens is not None graph_batch_size = model_input.input_tokens.shape[0] - model_executable = self.graph_runners[virtual_engine][ - graph_batch_size] + use_inputs_embeds = model_input.inputs_embeds is not None + model_executable = self.graph_runners[virtual_engine][( + graph_batch_size, use_inputs_embeds)] if previous_hidden_states is not None: previous_hidden_states = torch.cat([ previous_hidden_states, @@ -1763,6 +1833,7 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): self.vllm_config, virtual_engine): hidden_or_intermediate_states = model_executable( input_ids=model_input.input_tokens, + inputs_embeds=model_input.inputs_embeds, positions=model_input.input_positions, intermediate_tensors=intermediate_tensors, **MultiModalKwargs.as_kwargs(multi_modal_kwargs, @@ -1817,6 +1888,11 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): model_input.async_callback() # Sample the next token. + assert isinstance(self.sampler, Sampler) + orig_include_gpu_probs_tensor = self.sampler.include_gpu_probs_tensor + if model_input.inputs_embeds is not None: + self.sampler.include_gpu_probs_tensor = True + output: SamplerOutput = self.sampler( logits=logits, sampling_metadata=model_input.sampling_metadata, @@ -1838,6 +1914,18 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): output.model_forward_time = (orig_model_forward_time + model_forward_time) + if model_input.inputs_embeds is not None: + self.sampler.include_gpu_probs_tensor = \ + orig_include_gpu_probs_tensor + if output.sampled_token_ids is not None: + output.sampled_token_embeds = self.model.get_input_embeddings( + output.sampled_token_ids.squeeze(1)) + + for token_embed, sequence_group_output in zip( + output.sampled_token_embeds, output.outputs): + assert len(sequence_group_output.samples) == 1 + sequence_group_output.samples[0].output_embed = token_embed + if self.return_hidden_states: # we only need to pass hidden states of most recent token assert model_input.sampling_metadata is not None @@ -1931,6 +2019,7 @@ class CUDAGraphRunner(nn.Module): def capture( self, input_ids: torch.Tensor, + inputs_embeds: Optional[torch.Tensor], positions: torch.Tensor, intermediate_inputs: Optional[IntermediateTensors], kv_caches: List[torch.Tensor], @@ -1947,6 +2036,7 @@ class CUDAGraphRunner(nn.Module): for _ in range(_NUM_WARMUP_ITERS): self.model( input_ids=input_ids, + inputs_embeds=inputs_embeds, positions=positions, intermediate_tensors=intermediate_inputs, **kwargs, @@ -1959,6 +2049,9 @@ class CUDAGraphRunner(nn.Module): with torch.cuda.graph(self._graph, pool=memory_pool, stream=stream): output_hidden_or_intermediate_states = self.model( input_ids=input_ids, + **({ + "inputs_embeds": inputs_embeds, + } if inputs_embeds is not None else {}), positions=positions, intermediate_tensors=intermediate_inputs, **kwargs, @@ -1986,6 +2079,9 @@ class CUDAGraphRunner(nn.Module): self.input_buffers = { "input_ids": input_ids, + **({ + "inputs_embeds": inputs_embeds, + } if inputs_embeds is not None else {}), "positions": positions, "kv_caches": @@ -2006,6 +2102,7 @@ class CUDAGraphRunner(nn.Module): def forward( self, input_ids: torch.Tensor, + inputs_embeds: Optional[torch.Tensor], positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors], **kwargs, @@ -2020,6 +2117,9 @@ class CUDAGraphRunner(nn.Module): # so the shape is not padded, we need to copy partial only self.input_buffers["positions"][:positions.shape[0]].copy_( positions, non_blocking=True) + if inputs_embeds is not None: + self.input_buffers["inputs_embeds"][:inputs_embeds.shape[0]].copy_( + inputs_embeds, non_blocking=True) if self.backend_name != "NO_ATTENTION": self.input_buffers["slot_mapping"].copy_( diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index cbd5e2060cad5..fdb7353f2f9ce 100644 --- a/vllm/worker/pooling_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -84,10 +84,17 @@ class PoolingModelRunner( # explore how to leverage it. if (prefill_meta is None and decode_meta is not None and decode_meta.use_cuda_graph): - assert model_input.input_tokens is not None - graph_batch_size = model_input.input_tokens.shape[0] - model_executable = self.graph_runners[virtual_engine][ - graph_batch_size] + if model_input.inputs_embeds is None: + assert model_input.input_tokens is not None + graph_batch_size = model_input.input_tokens.shape[0] + model_executable = ( + self.graph_runners[model_input.virtual_engine][( + graph_batch_size, False)]) + else: + graph_batch_size = model_input.inputs_embeds.shape[0] + model_executable = ( + self.graph_runners[model_input.virtual_engine][( + graph_batch_size, True)]) else: model_executable = self.model